From 31017e1ec483e7d83c0d42070d0585cf6348e7fc Mon Sep 17 00:00:00 2001
From: bing <2524698668@qq.com>
Date: Sat, 11 May 2024 22:02:52 +0800
Subject: [PATCH] stt: add punctuation, emotion and speaker verification utils

---
 takway/stt/__init__.py                          |   1 +
 .../stt/__pycache__/__init__.cpython-39.pyc     | Bin 0 -> 172 bytes
 .../stt/__pycache__/base_stt.cpython-39.pyc     | Bin 0 -> 2382 bytes
 .../__pycache__/emotion_utils.cpython-39.pyc    | Bin 0 -> 3525 bytes
 .../__pycache__/funasr_utils.cpython-39.pyc     | Bin 0 -> 4491 bytes
 .../modified_funasr.cpython-39.pyc              | Bin 0 -> 4507 bytes
 .../punctuation_utils.cpython-39.pyc            | Bin 0 -> 2717 bytes
 .../speaker_ver_utils.cpython-39.pyc            | Bin 0 -> 2855 bytes
 takway/stt/base_stt.py                          |  65 ++++++
 takway/stt/emotion_utils.py                     | 142 +++++++++++++
 takway/stt/funasr_utils.py                      | 186 ++++++++++++++++++
 takway/stt/modified_funasr.py                   | 168 ++++++++++++++++
 takway/stt/punctuation_utils.py                 | 119 +++++++++++
 takway/stt/speaker_ver_utils.py                 |  86 ++++++++
 takway/stt/vosk_utils.py                        | 120 +++++++++++
 15 files changed, 887 insertions(+)
 create mode 100644 takway/stt/__init__.py
 create mode 100644 takway/stt/__pycache__/__init__.cpython-39.pyc
 create mode 100644 takway/stt/__pycache__/base_stt.cpython-39.pyc
 create mode 100644 takway/stt/__pycache__/emotion_utils.cpython-39.pyc
 create mode 100644 takway/stt/__pycache__/funasr_utils.cpython-39.pyc
 create mode 100644 takway/stt/__pycache__/modified_funasr.cpython-39.pyc
 create mode 100644 takway/stt/__pycache__/punctuation_utils.cpython-39.pyc
 create mode 100644 takway/stt/__pycache__/speaker_ver_utils.cpython-39.pyc
 create mode 100644 takway/stt/base_stt.py
 create mode 100644 takway/stt/emotion_utils.py
 create mode 100644 takway/stt/funasr_utils.py
 create mode 100644 takway/stt/modified_funasr.py
 create mode 100644 takway/stt/punctuation_utils.py
 create mode 100644 takway/stt/speaker_ver_utils.py
 create mode 100644 takway/stt/vosk_utils.py

diff --git a/takway/stt/__init__.py b/takway/stt/__init__.py
new file mode 100644
index 0000000..413fa1f
--- /dev/null
+++ b/takway/stt/__init__.py
@@ -0,0 +1 @@
+from .base_stt import *
\ No newline at end of file
diff --git a/takway/stt/__pycache__/__init__.cpython-39.pyc b/takway/stt/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22e74bb9fefea84f6a7232083b5a17148b00b8ba
Binary files /dev/null and b/takway/stt/__pycache__/__init__.cpython-39.pyc differ
diff --git a/takway/stt/__pycache__/base_stt.cpython-39.pyc b/takway/stt/__pycache__/base_stt.cpython-39.pyc
new file mode 100644
Binary files /dev/null and b/takway/stt/__pycache__/base_stt.cpython-39.pyc differ
diff --git a/takway/stt/__pycache__/emotion_utils.cpython-39.pyc b/takway/stt/__pycache__/emotion_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e19cdb0d6bfbb1e020e67f18f6fa8c0f603c3aa
Binary files /dev/null and b/takway/stt/__pycache__/emotion_utils.cpython-39.pyc differ
diff --git a/takway/stt/__pycache__/funasr_utils.cpython-39.pyc b/takway/stt/__pycache__/funasr_utils.cpython-39.pyc
new file mode 100644
Binary files /dev/null and b/takway/stt/__pycache__/funasr_utils.cpython-39.pyc differ
diff --git a/takway/stt/__pycache__/modified_funasr.cpython-39.pyc b/takway/stt/__pycache__/modified_funasr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6a7b297ed55c527ac3cfdf4b10600b0fb3b2432
Binary files /dev/null and b/takway/stt/__pycache__/modified_funasr.cpython-39.pyc differ
ziR7ASTZvE96Z%TCL0i?J*Mke-gNqCk-l*tWdt0=3{GiXn#8+UzxoXhBLpaShmk@h1cuOqbuooAI= zlt*#?`#_SSjSyO5E?kCOoi?+X2M*5umuuN@L8joU%(KD#DM}u*I-z3pXyZG95&ZvC zvAh9Fa}Nq=6stseCp?z?7VwkbM(`a3cxf6%c>~+%`;Cy(yvp#J8?X$*`rpN)oGp?ubQ@sp zASCYeiG0Td1~E84A5%iBv_s_QnGK>Y zs|sTFDEH>@wv{fDJvwwCf%!cz;(d{S%pC5){c^Z7bX7HT_UKLWh~B+CECBY;Spo5L zWQPqIw1LApGW0U*5tZH;2lNuKcq4O)nDNrkjpvbK2eI|%vAD0y%iLeG3uIUvT!#Ks z=S+s@t6>7xnTB@;-m|>C?+!~Jk@zi}sB@3#?yo5sLW(@NqF&&Yto$4DTeb&j_DAGn zNS24?Ps{hdJZ{IOeC&1dI_~Z~?B-A4RjL=r!7TLWIu`)Gn3ar_A;AQh+WgEu1ZYa` zeK|pSf@Xwe!0jL0K`n&9-LJ*xGxrgbxNkQDZQ|*29Ra7>jeG}8zLznedT)X|?1N9| zD<53s(|iU(KI{Os*f<+p1iv*lLCP07y-h1vRM!6 zkOL^lN5u(F0hA{`7wS}gkh=i73q?dbh#+ABI?+}qxt_AaXM70w1LG{+eAmkoq=(g! z9efg|vI?EG$4}vbsEPqzJM92`y7qv4GmL{sm-6fx1B1leLcRcCP`MLC{XAW{UN7^cRg>?)NwLnkD#5N1QsR|X*UmY5+joF ztGb3E(C;Ow>Ve^n<5R`NCuuw_9#vN+^{ISC(E zhE1;mc?Hy}IiW-ss;PL^i=Q3WO#b*9d<>-NIsoE58|*5O$#_8<$*tf}|% z{{u!O7OsO~zcn{OcT~*fY%uXNrQQi%6RF~e7uG|jHjt+L`#N;B)#Dr5Fe6`$E}w|BlfUu=`sX+cMRfve&xgg_-^O0Bd(CFEK;j#OlIliS;q>ZhuH`Q>VpS+edf)=ti~K!n{bcuh-BTUCs10xvS8gPHL?@nHEt=)L;Ojve#L-?8RHg8u zv>%HB7sH6!tgWZi8_$J3B#yaKp=k>dU_m&oaV$OIk`-M(sDgF`bI!<69b2d0(;P2M-vT@ zEowm&$iNGOA^*rKjFg9cwIG_fYB7M%=VDSZYj1dBCH;tqCxP8tc=5-}$%b5wqvb%X z2hqy1^gv?6z2v2QZr}yc{={-B9ElnJ~k}FyV`UKOqjT{QcCWTp(>C%#b>3igT4UvuJmA(P`r4H&Z z=X9l%M~i4>_5bQZv>`5L8JB_#k4yfhY!9+jwlDE^Dn&aM?J!8?iT0&TwpU`=E<;rr z<$M}Msq`YB7dnU+1_~F(Jo%XVF+;r5-!R&z+I<`o4CxA-ea&9lm06A*Ic!Z z8b3scj=@0m8m&7g}e@y=n?uhvT?k9m^lxYFo)G(iC$&2LT)0=v( zGi^;5tz2VtmgM>qa0eKjCs$wwkTQBufdyA56m75po`EyUkB*_-D@@;mV-(ehzu_uU zbU^TNSDPz5;=+@>l=%gSl*t5%REIuAhsVY}J1l<2M^8qr0>pu5w}IKe!*vGH&B-?K zm_1m>z&lNJfzq~hHETrAsCm56ujP6`!26$rj*;u2qoX9$KAjo2v`vH2O*1zDsn({I zn>jp-v*bClXww=>E%~??zx4~9T-hlaUXt*Lxd|6R%+v;j zC3u>J(owgZGZnW&hsnpoQAtGX^E5pV`=nSyiD6R%a;!$1&@tPZz}vK`wO|}hkH=>i<;{~1#lnx^TKjPV4ADCQi6aEiyz?db?H%* zsz31Oh9E*IR#00bRQ>){tJw|v3A?IM?cV}raA$l!TsKRB-Mx8p=g;px{Nzs$?ti-b z){UKC-*|BUGrX}oZ-1OWx_a&5@9sVL?DpYGx_j^L&IfPhJ9qBwzVp}JTYuU4@cP5w z-gvZi547{0zkT}f?{9-yH1g%=pMbfYTkkx$|M9^#1n)eB=CB)fuU(ws;oxtuq&T3+ES$#_6S{#f9GaGp{V3dudVmEl!~o zD~OUzriGa%VIUm?{(_|H%_*!bf%?N8coTey5#eD8_V-cGOhgcsmwF6+1irB6%kVu< iflKiTsKa%wYqkxT1L{_N|7$hRLIRmJNSdZfq5lH3&CY@V literal 0 HcmV?d00001 diff --git a/takway/stt/__pycache__/speaker_ver_utils.cpython-39.pyc b/takway/stt/__pycache__/speaker_ver_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95b3df3d25ce613862b5a2b73863fa7f2ce415a1 GIT binary patch literal 2855 zcma)8&2JM&6yMn|ZyY;>Bv4YISd<HzxSKpTfbZ`YEXXu z>CWa4Gn)1rHYQ5}8h7Cq--DqVC7~9P79m*IL%n6Fr`a;q(`s4NpeD7bO$$GoEt@*6 z!si;DqQ&PLEk4s*4x6IWJ39Z4Ip9;;C#@nYZckI4mgx*VMQ7>heXCVsWjgl^zQW3^ zG=8GjJY4{vGy4X%3!=yT9mcBz#)HkE z4&ZV|&W};>0EHX`ZosVEM1voaW~Wd6m5vSdykH&gdvJ>jU{Y;FM*7GYnUw6CBhAxB z7Fbvqd`54&CP#N^DO9+Fb(;b<9p>s>hbEr_8D;>`Z;vn_BS(s1&t6@hv8DD=n*6_l_upF9NKB(D2kn(~s zAzliOkDxNrwgHeWLJ8IP^*v)`P-9V}=AyP|j&!Q^3BQq&105hU4|SrAOn}L1uDu|# znDoGaSXoklC)nXtYk`*m(X$TqRNFQWEF3YeS%+jnJCAX)N5+oH*QvD%k%Jo#f1$a?=ioEPs7qs) z31D*K+8l9}W|;Uiez+ygIN=e5K@r_ST~sJt#@XISgMe}bMixa71)&rd?(ZSo|-Suz>E4r8;kPl}50SAg=SKEw=w$zf6!YyUCN?-yJ2lV2bF^4Z%} z5}_V}9>UQ-H|}F(7#r*Zr{1mN`#2uOXrzAfofl zB>fW8PsHMsQPm9r!&vZZXpr&zI2MZGD9#h&GCiRfx1eh*v$Fv5ESZ6ORv%t^6Vwyq zK4#V^dEU2uKg@dgX`JX?G`PiV zvkj~ruUpmdm~m7=Y>D^4IJ!p0*Pq35vEWQQ)5mX=FRlR7w&%G-DaWvS?{@o>UZktd|O=X=>N1Df)*dq#+{K1>YqS#k{+w&zh@#-11s)Fy 0: + self.audio_cache = np.concatenate([self.audio_cache, audio_data], axis=0) + + if not is_end and self.audio_cache.shape[0] < self.chunk_partial_size: + return text_dict + + total_chunk_num = int((len(self.audio_cache)-1)/self.chunk_partial_size) + 
+        if is_end:
+            # if the audio data is the end of a sentence, \
+            # we need to add one more chunk to the end to \
+            # ensure the end of the sentence is recognized correctly.
+            auto_det_end = True
+
+        if auto_det_end:
+            total_chunk_num += 1
+
+        # print(f"chunk_size: {self.chunk_size}, chunk_stride: {self.chunk_partial_size}, total_chunk_num: {total_chunk_num}, len: {len(self.audio_cache)}")
+        end_idx = None
+        for i in range(total_chunk_num):
+            if auto_det_end:
+                is_end = i == total_chunk_num - 1
+            start_idx = i*self.chunk_partial_size
+            if auto_det_end:
+                end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num-1 else -1
+            else:
+                end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num else -1
+            # print(f"cut part: {start_idx}:{end_idx}, is_end: {is_end}, i: {i}, total_chunk_num: {total_chunk_num}")
+            # t_stamp = time.time()
+
+            speech_chunk = self.audio_cache[start_idx:end_idx]
+
+            # TODO: exception handling
+            try:
+                res = self.asr_model.generate(input=speech_chunk, cache=self.asr_cache, is_final=is_end, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back)
+            except ValueError as e:
+                print(f"ValueError: {e}")
+                continue
+            text_dict['text'].append(self.text_postprecess(res[0], data_id='text'))
+            # print(f"each chunk time: {time.time()-t_stamp}")
+
+        if is_end:
+            self.audio_cache = None
+            self.asr_cache = {}
+        else:
+            if end_idx:
+                self.audio_cache = self.audio_cache[end_idx:]  # cut the processed part from audio_cache
+        text_dict['is_end'] = is_end
+
+        # print(f"text_dict: {text_dict}")
+        return text_dict
+
+
+if __name__ == '__main__':
+    from takway.audio_utils import BaseAudio
+    rec = BaseAudio(input=True, CHUNK=3840)
+
+    # return_type = 'bytes'
+    file_path = 'my_recording.wav'
+    data = rec.load_audio_file(file_path)
+
+    asr = FunAutoSpeechRecognizer()
+
+    # asr.recognize(data)
+    total_chunk_num = int((len(data)-1)/rec.CHUNK+1)
+    print(f"total_chunk_num: {total_chunk_num}")
+    for i in range(total_chunk_num):
+        is_end = i == total_chunk_num - 1
+        speech_chunk = data[i*rec.CHUNK:(i+1)*rec.CHUNK]
+        text_dict = asr.streaming_recognize(speech_chunk, is_end)
+    '''
+    asr.streaming_recognize(data, auto_det_end=True)
+    '''
+    
\ No newline at end of file
diff --git a/takway/stt/modified_funasr.py b/takway/stt/modified_funasr.py
new file mode 100644
index 0000000..5628aad
--- /dev/null
+++ b/takway/stt/modified_funasr.py
@@ -0,0 +1,168 @@
+from takway.stt.funasr_utils import FunAutoSpeechRecognizer
+from takway.stt.punctuation_utils import CTTRANSFORMER, Punctuation
+from takway.stt.emotion_utils import FUNASRFINETUNE, Emotion
+from takway.stt.speaker_ver_utils import ERES2NETV2, DEFALUT_SAVE_PATH, speaker_verfication
+import os
+import pdb
+import numpy as np
+class ModifiedRecognizer(FunAutoSpeechRecognizer):
+    def __init__(self,
+                 use_punct=True,
+                 use_emotion=False,
+                 use_speaker_ver=True):
+        super().__init__(
+            model_path="paraformer-zh-streaming",
+            device="cuda",
+            RATE=16000,
+            cfg_path=None,
+            debug=False,
+            chunk_ms=480,
+            encoder_chunk_look_back=4,
+            decoder_chunk_look_back=1)
+        self.use_punct = use_punct
+        self.use_emotion = use_emotion
+        self.use_speaker_ver = use_speaker_ver
+
+        if use_punct:
+            self.puctuation_model = Punctuation(**CTTRANSFORMER)
+        if use_emotion:
+            self.emotion_model = Emotion(**FUNASRFINETUNE)
+        if use_speaker_ver:
+            self.speaker_ver_model = speaker_verfication(**ERES2NETV2)
+
+    def initialize_speaker(self, speaker_1_wav):
+        if not self.use_speaker_ver:
+            raise NotImplementedError("no access")
+        if speaker_1_wav.endswith(".npy"):
+            self.save_speaker_path = speaker_1_wav
+        elif speaker_1_wav.endswith('.wav'):
+            self.save_speaker_path = os.path.join(DEFALUT_SAVE_PATH,
+                                                  os.path.basename(speaker_1_wav).replace(".wav", ".npy"))
+            # self.save_speaker_path = DEFALUT_SAVE_PATH
+            self.speaker_ver_model.wav2embeddings(speaker_1_wav, self.save_speaker_path)
+        else:
+            raise TypeError("only support [.npy] or [.wav].")
+
+
+    def speaker_ver(self, speaker_2_wav):
+        if not self.use_speaker_ver:
+            raise NotImplementedError("no access")
+        if not hasattr(self, "save_speaker_path"):
+            raise NotImplementedError("please initialize speaker first")
+        # pdb.set_trace()
+        return self.speaker_ver_model.verfication(base_emb=self.save_speaker_path,
+                                                  speaker_2_wav=speaker_2_wav) == 'yes'
+
+
+    def recognize(self, audio_data):
+        audio_data = self.check_audio_type(audio_data)
+
+        if self.use_speaker_ver:
+            if self.speaker_ver_model.verfication(self.save_speaker_path,
+                                                  speaker_2_wav=audio_data) == 'no':
+                return "Other People"
+
+        result = self.asr_model.generate(input=audio_data,
+                                         batch_size_s=300,
+                                         hotword=self.hotwords)
+        text = ''
+        for res in result:
+            text += res['text']
+        if self.use_punct:
+            text = self.puctuation_model.process(text+'#', append_period=False).replace('#', '')
+
+        return text
+
+    def recognize_emotion(self, audio_data):
+        audio_data = self.check_audio_type(audio_data)
+
+        if self.use_speaker_ver:
+            if self.speaker_ver_model.verfication(self.save_speaker_path,
+                                                  speaker_2_wav=audio_data) == 'no':
+                return "Other People"
+
+        if self.use_emotion:
+            return self.emotion_model.process(audio_data)
+        else:
+            raise NotImplementedError("no access")
+
+    def streaming_recognize(self, audio_data, is_end=False, auto_det_end=False):
+        """recognize partial result
+
+        Args:
+            audio_data: bytes or numpy array, partial audio data
+            is_end: bool, whether the audio data is the end of a sentence
+            auto_det_end: bool, whether to automatically detect the end of the audio data
+        """
+        audio_data = self.check_audio_type(audio_data)
+
+        if self.use_speaker_ver:
+            if self.speaker_ver_model.verfication(self.save_speaker_path,
+                                                  speaker_2_wav=audio_data) == 'no':
+                return "Other People"
+
+        text_dict = dict(text=[], is_end=is_end)
+
+        if self.audio_cache is None:
+            self.audio_cache = audio_data
+        else:
+            # print(f"audio_data: {audio_data.shape}, audio_cache: {self.audio_cache.shape}")
+            if self.audio_cache.shape[0] > 0:
+                self.audio_cache = np.concatenate([self.audio_cache, audio_data], axis=0)
+
+        if not is_end and self.audio_cache.shape[0] < self.chunk_partial_size:
+            return text_dict
+
+        total_chunk_num = int((len(self.audio_cache)-1)/self.chunk_partial_size)
+
+        if is_end:
+            # if the audio data is the end of a sentence, \
+            # we need to add one more chunk to the end to \
+            # ensure the end of the sentence is recognized correctly.
+            auto_det_end = True
+
+        if auto_det_end:
+            total_chunk_num += 1
+
+        # print(f"chunk_size: {self.chunk_size}, chunk_stride: {self.chunk_partial_size}, total_chunk_num: {total_chunk_num}, len: {len(self.audio_cache)}")
+        end_idx = None
+        for i in range(total_chunk_num):
+            if auto_det_end:
+                is_end = i == total_chunk_num - 1
+            start_idx = i*self.chunk_partial_size
+            if auto_det_end:
+                end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num-1 else -1
+            else:
+                end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num else -1
+            # print(f"cut part: {start_idx}:{end_idx}, is_end: {is_end}, i: {i}, total_chunk_num: {total_chunk_num}")
+            # t_stamp = time.time()
+
+            speech_chunk = self.audio_cache[start_idx:end_idx]
+
+            # TODO: exception handling
+            try:
+                res = self.asr_model.generate(input=speech_chunk, cache=self.asr_cache, is_final=is_end, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back)
+            except ValueError as e:
+                print(f"ValueError: {e}")
+                continue
+            if self.use_punct:
+                text_dict['text'].append(self.puctuation_model.process(self.text_postprecess(res[0], data_id='text'), cache=text_dict))
+            else:
+                text_dict['text'].append(self.text_postprecess(res[0], data_id='text'))
+
+
+            # print(f"each chunk time: {time.time()-t_stamp}")
+
+        if is_end:
+            self.audio_cache = None
+            self.asr_cache = {}
+        else:
+            if end_idx:
+                self.audio_cache = self.audio_cache[end_idx:]  # cut the processed part from audio_cache
+        text_dict['is_end'] = is_end
+
+        if self.use_punct and is_end:
+            text_dict['text'].append(self.puctuation_model.process('#', cache=text_dict).replace('#', ''))
+
+        # print(f"text_dict: {text_dict}")
+        return text_dict
\ No newline at end of file
diff --git a/takway/stt/punctuation_utils.py b/takway/stt/punctuation_utils.py
new file mode 100644
index 0000000..9e038e0
--- /dev/null
+++ b/takway/stt/punctuation_utils.py
@@ -0,0 +1,119 @@
+from funasr import AutoModel
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+PUNCTUATION_MARK = [",", ".", "?", "!", ",", "。", "?", "!"]
+"""
+FUNASR
+    Model size: 1G
+    Quality: relatively good
+    Input type: only accepts a single string, not a list; a list is treated as independent strings
+"""
+FUNASR = {
+    "model_type": "funasr",
+    "model_path": "ct-punc",
+    "model_revision": "v2.0.4"
+}
+"""
+CTTRANSFORMER
+    Model size: 275M
+    Quality: relatively poor
+    Input type: accepts both a string and a list, and also supports passing in a cache
+"""
+CTTRANSFORMER = {
+    "model_type": "ct-transformer",
+    "model_path": "iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727",
+    "model_revision": "v2.0.4"
+}
+
+class Punctuation:
+    def __init__(self,
+                 model_type="funasr",  # funasr | ct-transformer
+                 model_path="ct-punc",
+                 device="cuda",
+                 model_revision="v2.0.4",
+                 **kwargs):
+
+        self.model_type = model_type
+        self.initialize(model_type, model_path, device, model_revision, **kwargs)
+
+    def initialize(self,
+                   model_type,
+                   model_path,
+                   device,
+                   model_revision,
+                   **kwargs):
+        if model_type == 'funasr':
+            self.punc_model = AutoModel(model=model_path, device=device, model_revision=model_revision, **kwargs)
+        elif model_type == 'ct-transformer':
+            self.punc_model = pipeline(task=Tasks.punctuation, model=model_path, model_revision=model_revision, **kwargs)
+        else:
+            raise NotImplementedError(f"unsupported model type [{model_type}]. only [funasr|ct-transformer] expected.")
+
+    def check_text_type(self,
+                        text_data):
+        # funasr only accepts a single str, not a list; join a list into one string here
+        if self.model_type == 'funasr':
+            if isinstance(text_data, str):
+                pass
+            elif isinstance(text_data, list):
+                text_data = ''.join(text_data)
+            else:
+                raise TypeError(f"text must be str or list, but got {type(text_data)}")
+        # ct-transformer accepts list input
+        # TODO: check whether splitting the string improves efficiency
+        elif self.model_type == 'ct-transformer':
+            if isinstance(text_data, str):
+                text_data = [text_data]
+            elif isinstance(text_data, list):
+                pass
+            else:
+                raise TypeError(f"text must be str or list, but got {type(text_data)}")
+        else:
+            pass
+        return text_data
+
+    def generate_cache(self, cache):
+        new_cache = {'pre_text': ""}
+        for text in cache['text']:
+            if text != '':
+                new_cache['pre_text'] = new_cache['pre_text']+text
+        return new_cache
+
+    def process(self,
+                text,
+                append_period=False,
+                cache={}):
+        if text == '':
+            return ''
+        text = self.check_text_type(text)
+        if self.model_type == 'funasr':
+            result = self.punc_model.generate(text)
+        elif self.model_type == 'ct-transformer':
+            if cache != {}:
+                cache = self.generate_cache(cache)
+            result = self.punc_model(text, cache=cache)
+        punced_text = ''
+        for res in result:
+            punced_text += res['text']
+        # if the text does not end with a punctuation mark, append one manually
+        if append_period and not punced_text[-1] in PUNCTUATION_MARK:
+            punced_text += "。"
+        return punced_text
+
+if __name__ == "__main__":
+    inputs = "把字符串拆分为list只|适用于ct-transformer模型|在数据处理部分|已经把list转为单个字符串"
+    """
+    Splitting the string into a list only applies to the ct-transformer model;
+    in the data-processing step the list has already been joined into a single string for funasr.
+    """
+    vads = inputs.split("|")
+    device = "cuda"
+    CTTRANSFORMER.update({"device": device})
+    puct_model = Punctuation(**CTTRANSFORMER)
+    result = puct_model.process(vads)
+    print(result)
+    # FUNASR.update({"device":"cuda"})
+    # puct_model = Punctuation(**FUNASR)
+    # result = puct_model.process(vads)
+    # print(result)
\ No newline at end of file
diff --git a/takway/stt/speaker_ver_utils.py b/takway/stt/speaker_ver_utils.py
new file mode 100644
index 0000000..838393f
--- /dev/null
+++ b/takway/stt/speaker_ver_utils.py
@@ -0,0 +1,86 @@
+from modelscope.pipelines import pipeline
+import numpy as np
+import os
+import pdb
+ERES2NETV2 = {
+    "task": 'speaker-verification',
+    "model_name": 'damo/speech_eres2netv2_sv_zh-cn_16k-common',
+    "model_revision": 'v1.0.1',
+    "save_embeddings": False
+}
+
+# default path for saving speaker embeddings
+DEFALUT_SAVE_PATH = r"D:\python\irving\takway_base-main\examples"
+
+class speaker_verfication:
+    def __init__(self,
+                 task='speaker-verification',
+                 model_name='damo/speech_eres2netv2_sv_zh-cn_16k-common',
+                 model_revision='v1.0.1',
+                 device="cuda",
+                 save_embeddings=False):
+        self.pipeline = pipeline(
+            task=task,
+            model=model_name,
+            model_revision=model_revision,
+            device=device)
+        self.save_embeddings = save_embeddings
+
+    def wav2embeddings(self, speaker_1_wav, save_path=None):
+        result = self.pipeline([speaker_1_wav], output_emb=True)
+        speaker_1_emb = result['embs'][0]
+        if save_path is not None:
+            np.save(save_path, speaker_1_emb)
+        return speaker_1_emb
+
+    def _verifaction(self, speaker_1_wav, speaker_2_wav, threshold, save_path):
+        if not self.save_embeddings:
+            result = self.pipeline([speaker_1_wav, speaker_2_wav], thr=threshold)
+            return result["text"]
+        else:
+            result = self.pipeline([speaker_1_wav, speaker_2_wav], thr=threshold, output_emb=True)
+            speaker1_emb = result["embs"][0]
+            speaker2_emb = result["embs"][1]
+            np.save(os.path.join(save_path, "speaker_1.npy"), speaker1_emb)
+            return result['outputs']["text"]
+
+    def _verifaction_from_embedding(self, base_emb, speaker_2_wav, threshold):
+        base_emb = np.load(base_emb)
+        result = self.pipeline([speaker_2_wav], output_emb=True)
+        speaker2_emb = result["embs"][0]
+        similarity = np.dot(base_emb, speaker2_emb) / (np.linalg.norm(base_emb) * np.linalg.norm(speaker2_emb))
+        if similarity > threshold:
+            return "yes"
+        else:
+            return "no"
+
+    def verfication(self,
+                    base_emb=None,
+                    speaker_1_wav=None,
+                    speaker_2_wav=None,
+                    threshold=0.333,
+                    save_path=None):
+        if base_emb is not None and speaker_1_wav is not None:
+            raise ValueError("Only need one of them, base_emb or speaker_1_wav")
+        if base_emb is not None and speaker_2_wav is not None:
+            return self._verifaction_from_embedding(base_emb, speaker_2_wav, threshold)
+        elif speaker_1_wav is not None and speaker_2_wav is not None:
+            return self._verifaction(speaker_1_wav, speaker_2_wav, threshold, save_path)
+        else:
+            raise NotImplementedError
+
+if __name__ == '__main__':
+    verifier = speaker_verfication(**ERES2NETV2)
+
+    verifier = speaker_verfication(save_embeddings=False)
+    result = verifier.verfication(base_emb=None, speaker_1_wav=r"C:\Users\bing\Downloads\speaker1_a_cn_16k.wav",
+                                  speaker_2_wav=r"C:\Users\bing\Downloads\speaker2_a_cn_16k.wav",
+                                  threshold=0.333,
+                                  save_path=r"D:\python\irving\takway_base-main\savePath"
+                                  )
+    print("---")
+    print(result)
+    print(verifier.verfication(r"D:\python\irving\takway_base-main\savePath\speaker_1.npy",
+                               speaker_2_wav=r"C:\Users\bing\Downloads\speaker1_b_cn_16k.wav",
+                               threshold=0.333,
+                               ))
\ No newline at end of file
diff --git a/takway/stt/vosk_utils.py b/takway/stt/vosk_utils.py
new file mode 100644
index 0000000..b67cfa5
--- /dev/null
+++ b/takway/stt/vosk_utils.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ####################################################### #
+# VOSKAutoSpeechRecognizer
+# ####################################################### #
+import json
+import wave
+import io
+import os
+from vosk import Model, KaldiRecognizer, SetLogLevel
+from .base_stt import STTBase
+from ..common_utils import decode_str2bytes
+
+class VOSKAutoSpeechRecognizer(STTBase):
+    def __init__(self, model_path="vosk-model-small-cn-0.22", RATE=16000, cfg_path=None, efficent_mode=True, debug=False):
+        super().__init__(model_path=model_path, RATE=RATE, cfg_path=cfg_path, debug=debug)
+        self.asr = KaldiRecognizer(Model(model_path=model_path), RATE)  # VOSK recognizer used by the methods below
+
+        self.apply_asr_config(self.asr_cfg)
+
+    def recognize_keywords(self, audio_data, partial_size=None, queue=None):
+        """recognize keywords in audio data"""
+        audio_data = self.check_audio_type(audio_data)
+        if partial_size is None:
+            rec_result = self.recognize(audio_data, queue)
+            rec_text = self.result_postprecess(rec_result)
+        else:
+            rec_result = self.partial_recognize(audio_data, partial_size, queue)
+            rec_text = self.result_postprecess(rec_result, 'partial')
+            print(f"rec_text: {rec_text}")
+        if rec_text != '':
+            print(f"rec_text: {rec_text}")
+        if any(keyword in rec_text for keyword in self.keywords):
+            print("Keyword detected.")
+            return True, rec_text
+        else:
+            return False, None
+
+    def recognize(self, audio_data, queue=None):
+        """recognize audio data to text"""
+        audio_data = self.check_audio_type(audio_data)
+        self.asr.AcceptWaveform(audio_data)
+        result = json.loads(self.asr.FinalResult())
+        # TODO: put result to queue
+        return result
+
+    def partial_recognize(self, audio_data, partial_size=1024, queue=None):
+        """recognize partial result"""
+        audio_data = self.check_audio_type(audio_data)
+        text_dict = dict(
+            text=[],
+            partial=[],
+            final=[],
+            is_end=False)
+        # split the audio data into chunks and recognize them one by one
+        for i in range(0, len(audio_data), partial_size):
+            # print(f"partial data: {i} - {i+partial_size}")
+            data = audio_data[i:i+partial_size]
+            if len(data) == 0:
+                break
+            if self.asr.AcceptWaveform(data):
+                result = json.loads(self.asr.Result())
+                if result['text'] != '':
+                    text_dict['text'].append(result['text'])
+                    if queue is not None:
+                        queue.put(('stt_info', text_dict))
+                    # print(f"text result: {result}")
+            else:
+                result = json.loads(self.asr.PartialResult())
+                if result['partial'] != '':
+                    # text_dict['partial'].append(result['partial'])
+                    text_dict['partial'] = [result['partial']]
+                    if queue is not None:
+                        queue.put(('stt_info', text_dict))
+                    # print(f"partial result: {result}")
+
+        # final recognize
+        final_result = json.loads(self.asr.FinalResult())
+        if final_result['text'] != '':
+            text_dict['final'].append(final_result['text'])
+            text_dict['text'].append(final_result['text'])
+
+        text_dict['is_end'] = True
+
+        print(f"final dict: {text_dict}")
+        if queue is not None:
+            queue.put(('stt_info', text_dict))
+        return text_dict
+
+
+if __name__ == "__main__":
+    '''
+    wav_file_path = "recording.wav"
+
+    # You can set log level to -1 to disable debug messages
+    SetLogLevel(0)
+
+    model = Model(model_path="vosk-model-small-cn-0.22")
+
+    # record audio
+    # record_audio(wav_file_path)
+    data = record_audio()
+
+    # transcribe the recorded audio
+    result = audio_to_text(data, model)
+
+    print("-------------")
+    print(result)
+    '''
+    from takway.audio_utils import Recorder
+    rec = Recorder()
+
+    return_type = 'bytes'
+    data = rec.record(return_type)
+    print(type(data))
+
+    asr = VOSKAutoSpeechRecognizer()
+    # asr.recognize(data)
+    asr.add_keyword("你好")
+    asr.recognize_keywords(data)
\ No newline at end of file
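
Notes (not part of the patch): a minimal usage sketch of the ModifiedRecognizer introduced above, following the same loading pattern as the __main__ block in funasr_utils.py. The wav paths are hypothetical, and it assumes the FunASR/ModelScope models referenced in this patch can be resolved locally.

    # sketch only: assumes the takway package from this patch is importable
    from takway.audio_utils import BaseAudio              # loader already used in funasr_utils.py's demo
    from takway.stt.modified_funasr import ModifiedRecognizer

    rec = BaseAudio(input=True, CHUNK=3840)
    data = rec.load_audio_file('my_recording.wav')        # hypothetical recording

    asr = ModifiedRecognizer(use_punct=True, use_emotion=False, use_speaker_ver=True)
    asr.initialize_speaker('speaker1_a_cn_16k.wav')       # hypothetical enrollment sample (.wav or .npy)

    # returns punctuated text, or "Other People" when speaker verification rejects the audio
    print(asr.recognize(data))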