From 31017e1ec483e7d83c0d42070d0585cf6348e7fc Mon Sep 17 00:00:00 2001
From: bing <2524698668@qq.com>
Date: Sat, 11 May 2024 22:02:52 +0800
Subject: [PATCH] stt: add punctuation, emotion and speaker verification utils

---
 takway/stt/__init__.py                          |   1 +
 .../stt/__pycache__/__init__.cpython-39.pyc     | Bin 0 -> 172 bytes
 .../stt/__pycache__/base_stt.cpython-39.pyc     | Bin 0 -> 2382 bytes
 .../__pycache__/emotion_utils.cpython-39.pyc    | Bin 0 -> 3525 bytes
 .../__pycache__/funasr_utils.cpython-39.pyc     | Bin 0 -> 4491 bytes
 .../modified_funasr.cpython-39.pyc              | Bin 0 -> 4507 bytes
 .../punctuation_utils.cpython-39.pyc            | Bin 0 -> 2717 bytes
 .../speaker_ver_utils.cpython-39.pyc            | Bin 0 -> 2855 bytes
 takway/stt/base_stt.py                          |  65 ++++++
 takway/stt/emotion_utils.py                     | 142 +++++++++++++
 takway/stt/funasr_utils.py                      | 186 ++++++++++++++++++
 takway/stt/modified_funasr.py                   | 168 ++++++++++++++++
 takway/stt/punctuation_utils.py                 | 119 +++++++++++
 takway/stt/speaker_ver_utils.py                 |  86 ++++++++
 takway/stt/vosk_utils.py                        | 120 +++++++++++
 15 files changed, 887 insertions(+)
 create mode 100644 takway/stt/__init__.py
 create mode 100644 takway/stt/__pycache__/__init__.cpython-39.pyc
 create mode 100644 takway/stt/__pycache__/base_stt.cpython-39.pyc
 create mode 100644 takway/stt/__pycache__/emotion_utils.cpython-39.pyc
 create mode 100644 takway/stt/__pycache__/funasr_utils.cpython-39.pyc
 create mode 100644 takway/stt/__pycache__/modified_funasr.cpython-39.pyc
 create mode 100644 takway/stt/__pycache__/punctuation_utils.cpython-39.pyc
 create mode 100644 takway/stt/__pycache__/speaker_ver_utils.cpython-39.pyc
 create mode 100644 takway/stt/base_stt.py
 create mode 100644 takway/stt/emotion_utils.py
 create mode 100644 takway/stt/funasr_utils.py
 create mode 100644 takway/stt/modified_funasr.py
 create mode 100644 takway/stt/punctuation_utils.py
 create mode 100644 takway/stt/speaker_ver_utils.py
 create mode 100644 takway/stt/vosk_utils.py

diff --git a/takway/stt/__init__.py b/takway/stt/__init__.py
new file mode 100644
index 0000000..413fa1f
--- /dev/null
+++ b/takway/stt/__init__.py
@@ -0,0 +1 @@
+from .base_stt import *
\ No newline at end of file
diff --git a/takway/stt/__pycache__/__init__.cpython-39.pyc b/takway/stt/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22e74bb9fefea84f6a7232083b5a17148b00b8ba
Binary files /dev/null and b/takway/stt/__pycache__/__init__.cpython-39.pyc differ
diff --git a/takway/stt/__pycache__/base_stt.cpython-39.pyc b/takway/stt/__pycache__/base_stt.cpython-39.pyc
new file mode 100644
Binary files /dev/null and b/takway/stt/__pycache__/base_stt.cpython-39.pyc differ
diff --git a/takway/stt/__pycache__/emotion_utils.cpython-39.pyc b/takway/stt/__pycache__/emotion_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e19cdb0d6bfbb1e020e67f18f6fa8c0f603c3aa
Binary files /dev/null and b/takway/stt/__pycache__/emotion_utils.cpython-39.pyc differ
diff --git a/takway/stt/__pycache__/funasr_utils.cpython-39.pyc b/takway/stt/__pycache__/funasr_utils.cpython-39.pyc
new file mode 100644
Binary files /dev/null and b/takway/stt/__pycache__/funasr_utils.cpython-39.pyc differ
diff --git a/takway/stt/__pycache__/modified_funasr.cpython-39.pyc b/takway/stt/__pycache__/modified_funasr.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6a7b297ed55c527ac3cfdf4b10600b0fb3b2432
Binary files /dev/null and b/takway/stt/__pycache__/modified_funasr.cpython-39.pyc differ
ziR7ASTZvE96Z%TCL0i?J*Mke-gNqCk-l*tWdt0=3{GiXn#8+UzxoXhBLpaShmk@h1cuOqbuooAI= zlt*#?`#_SSjSyO5E?kCOoi?+X2M*5umuuN@L8joU%(KD#DM}u*I-z3pXyZG95&ZvC zvAh9Fa}Nq=6stseCp?z?7VwkbM(`a3cxf6%c>~+%`;Cy(yvp#J8?X$*`rpN)oGp?ubQ@sp zASCYeiG0Td1~E84A5%iBv_s_QnGK>Y zs|sTFDEH>@wv{fDJvwwCf%!cz;(d{S%pC5){c^Z7bX7HT_UKLWh~B+CECBY;Spo5L zWQPqIw1LApGW0U*5tZH;2lNuKcq4O)nDNrkjpvbK2eI|%vAD0y%iLeG3uIUvT!#Ks z=S+s@t6>7xnTB@;-m|>C?+!~Jk@zi}sB@3#?yo5sLW(@NqF&&Yto$4DTeb&j_DAGn zNS24?Ps{hdJZ{IOeC&1dI_~Z~?B-A4RjL=r!7TLWIu`)Gn3ar_A;AQh+WgEu1ZYa` zeK|pSf@Xwe!0jL0K`n&9-LJ*xGxrgbxNkQDZQ|*29Ra7>jeG}8zLznedT)X|?1N9| zD<53s(|iU(KI{Os*f<+p1iv*lLCP07y-h1vRM!6 zkOL^lN5u(F0hA{`7wS}gkh=i73q?dbh#+ABI?+}qxt_AaXM70w1LG{+eAmkoq=(g! z9efg|vI?EG$4}vbsEPqzJM92`y7qv4GmL{sm-6fx1B1leLcRcCP`MLC{XAW{UN7^cRg>?)NwLnkD#5N1QsR|X*UmY5+joF ztGb3E(C;Ow>Ve^n<5R`NCuuw_9#vN+^{ISC(E zhE1;mc?Hy}IiW-ss;PL^i=Q3WO#b*9d<>-NIsoE58|*5O$#_8<$*tf}|% z{{u!O7OsO~zcn{OcT~*fY%uXNrQQi%6RF~e7uG|jHjt+L`#N;B)#Dr5Fe6`$E}w|BlfUu=`sX+cMRfve&xgg_-^O0Bd(CFEK;j#OlIliS;q>ZhuH`Q>VpS+edf)=ti~K!n{bcuh-BTUCs10xvS8gPHL?@nHEt=)L;Ojve#L-?8RHg8u zv>%HB7sH6!tgWZi8_$J3B#yaKp=k>dU_m&oaV$OIk`-M(sDgF`bI!<69b2d0(;P2M-vT@ zEowm&$iNGOA^*rKjFg9cwIG_fYB7M%=VDSZYj1dBCH;tqCxP8tc=5-}$%b5wqvb%X z2hqy1^gv?6z2v2QZr}yc{={-B9ElnJ~k}FyV`UKOqjT{QcCWTp(>C%#b>3igT4UvuJmA(P`r4H&Z z=X9l%M~i4>_5bQZv>`5L8JB_#k4yfhY!9+jwlDE^Dn&aM?J!8?iT0&TwpU`=E<;rr z<$M}Msq`YB7dnU+1_~F(Jo%XVF+;r5-!R&z+I<`o4CxA-ea&9lm06A*Ic!Z z8b3scj=@0m8m&7g}e@y=n?uhvT?k9m^lxYFo)G(iC$&2LT)0=v( zGi^;5tz2VtmgM>qa0eKjCs$wwkTQBufdyA56m75po`EyUkB*_-D@@;mV-(ehzu_uU zbU^TNSDPz5;=+@>l=%gSl*t5%REIuAhsVY}J1l<2M^8qr0>pu5w}IKe!*vGH&B-?K zm_1m>z&lNJfzq~hHETrAsCm56ujP6`!26$rj*;u2qoX9$KAjo2v`vH2O*1zDsn({I zn>jp-v*bClXww=>E%~??zx4~9T-hlaUXt*Lxd|6R%+v;j zC3u>J(owgZGZnW&hsnpoQAtGX^E5pV`=nSyiD6R%a;!$1&@tPZz}vK`wO|}hkH=>i<;{~1#lnx^TKjPV4ADCQi6aEiyz?db?H%* zsz31Oh9E*IR#00bRQ>){tJw|v3A?IM?cV}raA$l!TsKRB-Mx8p=g;px{Nzs$?ti-b z){UKC-*|BUGrX}oZ-1OWx_a&5@9sVL?DpYGx_j^L&IfPhJ9qBwzVp}JTYuU4@cP5w z-gvZi547{0zkT}f?{9-yH1g%=pMbfYTkkx$|M9^#1n)eB=CB)fuU(ws;oxtuq&T3+ES$#_6S{#f9GaGp{V3dudVmEl!~o zD~OUzriGa%VIUm?{(_|H%_*!bf%?N8coTey5#eD8_V-cGOhgcsmwF6+1irB6%kVu< iflKiTsKa%wYqkxT1L{_N|7$hRLIRmJNSdZfq5lH3&CY@V literal 0 HcmV?d00001 diff --git a/takway/stt/__pycache__/speaker_ver_utils.cpython-39.pyc b/takway/stt/__pycache__/speaker_ver_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95b3df3d25ce613862b5a2b73863fa7f2ce415a1 GIT binary patch literal 2855 zcma)8&2JM&6yMn|ZyY;>Bv4YISd<HzxSKpTfbZ`YEXXu z>CWa4Gn)1rHYQ5}8h7Cq--DqVC7~9P79m*IL%n6Fr`a;q(`s4NpeD7bO$$GoEt@*6 z!si;DqQ&PLEk4s*4x6IWJ39Z4Ip9;;C#@nYZckI4mgx*VMQ7>heXCVsWjgl^zQW3^ zG=8GjJY4{vGy4X%3!=yT9mcBz#)HkE z4&ZV|&W};>0EHX`ZosVEM1voaW~Wd6m5vSdykH&gdvJ>jU{Y;FM*7GYnUw6CBhAxB z7Fbvqd`54&CP#N^DO9+Fb(;b<9p>s>hbEr_8D;>`Z;vn_BS(s1&t6@hv8DD=n*6_l_upF9NKB(D2kn(~s zAzliOkDxNrwgHeWLJ8IP^*v)`P-9V}=AyP|j&!Q^3BQq&105hU4|SrAOn}L1uDu|# znDoGaSXoklC)nXtYk`*m(X$TqRNFQWEF3YeS%+jnJCAX)N5+oH*QvD%k%Jo#f1$a?=ioEPs7qs) z31D*K+8l9}W|;Uiez+ygIN=e5K@r_ST~sJt#@XISgMe}bMixa71)&rd?(ZSo|-Suz>E4r8;kPl}50SAg=SKEw=w$zf6!YyUCN?-yJ2lV2bF^4Z%} z5}_V}9>UQ-H|}F(7#r*Zr{1mN`#2uOXrzAfofl zB>fW8PsHMsQPm9r!&vZZXpr&zI2MZGD9#h&GCiRfx1eh*v$Fv5ESZ6ORv%t^6Vwyq zK4#V^dEU2uKg@dgX`JX?G`PiV zvkj~ruUpmdm~m7=Y>D^4IJ!p0*Pq35vEWQQ)5mX=FRlR7w&%G-DaWvS?{@o>UZktd|O=X=>N1Df)*dq#+{K1>YqS#k{+w&zh@#-11s)Fy 0: + self.audio_cache = np.concatenate([self.audio_cache, audio_data], axis=0) + + if not is_end and self.audio_cache.shape[0] < self.chunk_partial_size: + return text_dict + + total_chunk_num = int((len(self.audio_cache)-1)/self.chunk_partial_size) + 
+        if is_end:
+            # if the audio data is the end of a sentence, \
+            # we need to add one more chunk to the end to \
+            # ensure the end of the sentence is recognized correctly.
+            auto_det_end = True
+
+        if auto_det_end:
+            total_chunk_num += 1
+
+        # print(f"chunk_size: {self.chunk_size}, chunk_stride: {self.chunk_partial_size}, total_chunk_num: {total_chunk_num}, len: {len(self.audio_cache)}")
+        end_idx = None
+        for i in range(total_chunk_num):
+            if auto_det_end:
+                is_end = i == total_chunk_num - 1
+            start_idx = i*self.chunk_partial_size
+            if auto_det_end:
+                end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num-1 else -1
+            else:
+                end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num else -1
+            # print(f"cut part: {start_idx}:{end_idx}, is_end: {is_end}, i: {i}, total_chunk_num: {total_chunk_num}")
+            # t_stamp = time.time()
+
+            speech_chunk = self.audio_cache[start_idx:end_idx]
+
+            # TODO: exception handling
+            try:
+                res = self.asr_model.generate(input=speech_chunk, cache=self.asr_cache, is_final=is_end, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back)
+            except ValueError as e:
+                print(f"ValueError: {e}")
+                continue
+            text_dict['text'].append(self.text_postprecess(res[0], data_id='text'))
+            # print(f"each chunk time: {time.time()-t_stamp}")
+
+        if is_end:
+            self.audio_cache = None
+            self.asr_cache = {}
+        else:
+            if end_idx:
+                self.audio_cache = self.audio_cache[end_idx:]  # cut the processed part from audio_cache
+        text_dict['is_end'] = is_end
+
+        # print(f"text_dict: {text_dict}")
+        return text_dict
+
+
+if __name__ == '__main__':
+    from takway.audio_utils import BaseAudio
+    rec = BaseAudio(input=True, CHUNK=3840)
+
+    # return_type = 'bytes'
+    file_path = 'my_recording.wav'
+    data = rec.load_audio_file(file_path)
+
+    asr = FunAutoSpeechRecognizer()
+
+    # asr.recognize(data)
+    total_chunk_num = int((len(data)-1)/rec.CHUNK+1)
+    print(f"total_chunk_num: {total_chunk_num}")
+    for i in range(total_chunk_num):
+        is_end = i == total_chunk_num - 1
+        speech_chunk = data[i*rec.CHUNK:(i+1)*rec.CHUNK]
+        text_dict = asr.streaming_recognize(speech_chunk, is_end)
+    '''
+    asr.streaming_recognize(data, auto_det_end=True)
+    '''
+    
\ No newline at end of file
diff --git a/takway/stt/modified_funasr.py b/takway/stt/modified_funasr.py
new file mode 100644
index 0000000..5628aad
--- /dev/null
+++ b/takway/stt/modified_funasr.py
@@ -0,0 +1,168 @@
+from takway.stt.funasr_utils import FunAutoSpeechRecognizer
+from takway.stt.punctuation_utils import CTTRANSFORMER, Punctuation
+from takway.stt.emotion_utils import FUNASRFINETUNE, Emotion
+from takway.stt.speaker_ver_utils import ERES2NETV2, DEFALUT_SAVE_PATH, speaker_verfication
+import os
+import pdb
+import numpy as np
+class ModifiedRecognizer(FunAutoSpeechRecognizer):
+    def __init__(self,
+                 use_punct=True,
+                 use_emotion=False,
+                 use_speaker_ver=True):
+        super().__init__(
+            model_path="paraformer-zh-streaming",
+            device="cuda",
+            RATE=16000,
+            cfg_path=None,
+            debug=False,
+            chunk_ms=480,
+            encoder_chunk_look_back=4,
+            decoder_chunk_look_back=1)
+        self.use_punct = use_punct
+        self.use_emotion = use_emotion
+        self.use_speaker_ver = use_speaker_ver
+
+        if use_punct:
+            self.puctuation_model = Punctuation(**CTTRANSFORMER)
+        if use_emotion:
+            self.emotion_model = Emotion(**FUNASRFINETUNE)
+        if use_speaker_ver:
+            self.speaker_ver_model = speaker_verfication(**ERES2NETV2)
+
+    def initialize_speaker(self, speaker_1_wav):
+        if not self.use_speaker_ver:
+            raise NotImplementedError("no access")
+        if speaker_1_wav.endswith(".npy"):
+            self.save_speaker_path = speaker_1_wav
+        elif speaker_1_wav.endswith('.wav'):
+            self.save_speaker_path = os.path.join(DEFALUT_SAVE_PATH,
+                                                  os.path.basename(speaker_1_wav).replace(".wav", ".npy"))
+            # self.save_speaker_path = DEFALUT_SAVE_PATH
+            self.speaker_ver_model.wav2embeddings(speaker_1_wav, self.save_speaker_path)
+        else:
+            raise TypeError("only support [.npy] or [.wav].")
+
+
+    def speaker_ver(self, speaker_2_wav):
+        if not self.use_speaker_ver:
+            raise NotImplementedError("no access")
+        if not hasattr(self, "save_speaker_path"):
+            raise NotImplementedError("please initialize speaker first")
+        # pdb.set_trace()
+        return self.speaker_ver_model.verfication(base_emb=self.save_speaker_path,
+                                                  speaker_2_wav=speaker_2_wav) == 'yes'
+
+
+    def recognize(self, audio_data):
+        audio_data = self.check_audio_type(audio_data)
+
+        if self.use_speaker_ver:
+            if self.speaker_ver_model.verfication(self.save_speaker_path,
+                                                  speaker_2_wav=audio_data) == 'no':
+                return "Other People"
+
+        result = self.asr_model.generate(input=audio_data,
+                                         batch_size_s=300,
+                                         hotword=self.hotwords)
+        text = ''
+        for res in result:
+            text += res['text']
+        if self.use_punct:
+            text = self.puctuation_model.process(text+'#', append_period=False).replace('#', '')
+
+        return text
+
+    def recognize_emotion(self, audio_data):
+        audio_data = self.check_audio_type(audio_data)
+
+        if self.use_speaker_ver:
+            if self.speaker_ver_model.verfication(self.save_speaker_path,
+                                                  speaker_2_wav=audio_data) == 'no':
+                return "Other People"
+
+        if self.use_emotion:
+            return self.emotion_model.process(audio_data)
+        else:
+            raise NotImplementedError("no access")
+
+    def streaming_recognize(self, audio_data, is_end=False, auto_det_end=False):
+        """recognize partial result
+
+        Args:
+            audio_data: bytes or numpy array, partial audio data
+            is_end: bool, whether the audio data is the end of a sentence
+            auto_det_end: bool, whether to automatically detect the end of the audio data
+        """
+        audio_data = self.check_audio_type(audio_data)
+
+        if self.use_speaker_ver:
+            if self.speaker_ver_model.verfication(self.save_speaker_path,
+                                                  speaker_2_wav=audio_data) == 'no':
+                return "Other People"
+
+        text_dict = dict(text=[], is_end=is_end)
+
+        if self.audio_cache is None:
+            self.audio_cache = audio_data
+        else:
+            # print(f"audio_data: {audio_data.shape}, audio_cache: {self.audio_cache.shape}")
+            if self.audio_cache.shape[0] > 0:
+                self.audio_cache = np.concatenate([self.audio_cache, audio_data], axis=0)
+
+        if not is_end and self.audio_cache.shape[0] < self.chunk_partial_size:
+            return text_dict
+
+        total_chunk_num = int((len(self.audio_cache)-1)/self.chunk_partial_size)
+
+        if is_end:
+            # if the audio data is the end of a sentence, \
+            # we need to add one more chunk to the end to \
+            # ensure the end of the sentence is recognized correctly.
+            auto_det_end = True
+
+        if auto_det_end:
+            total_chunk_num += 1
+
+        # print(f"chunk_size: {self.chunk_size}, chunk_stride: {self.chunk_partial_size}, total_chunk_num: {total_chunk_num}, len: {len(self.audio_cache)}")
+        end_idx = None
+        for i in range(total_chunk_num):
+            if auto_det_end:
+                is_end = i == total_chunk_num - 1
+            start_idx = i*self.chunk_partial_size
+            if auto_det_end:
+                end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num-1 else -1
+            else:
+                end_idx = (i+1)*self.chunk_partial_size if i < total_chunk_num else -1
+            # print(f"cut part: {start_idx}:{end_idx}, is_end: {is_end}, i: {i}, total_chunk_num: {total_chunk_num}")
+            # t_stamp = time.time()
+
+            speech_chunk = self.audio_cache[start_idx:end_idx]
+
+            # TODO: exception handling
+            try:
+                res = self.asr_model.generate(input=speech_chunk, cache=self.asr_cache, is_final=is_end, chunk_size=self.chunk_size, encoder_chunk_look_back=self.encoder_chunk_look_back, decoder_chunk_look_back=self.decoder_chunk_look_back)
+            except ValueError as e:
+                print(f"ValueError: {e}")
+                continue
+            if self.use_punct:
+                text_dict['text'].append(self.puctuation_model.process(self.text_postprecess(res[0], data_id='text'), cache=text_dict))
+            else:
+                text_dict['text'].append(self.text_postprecess(res[0], data_id='text'))
+
+
+            # print(f"each chunk time: {time.time()-t_stamp}")
+
+        if is_end:
+            self.audio_cache = None
+            self.asr_cache = {}
+        else:
+            if end_idx:
+                self.audio_cache = self.audio_cache[end_idx:]  # cut the processed part from audio_cache
+        text_dict['is_end'] = is_end
+
+        if self.use_punct and is_end:
+            text_dict['text'].append(self.puctuation_model.process('#', cache=text_dict).replace('#', ''))
+
+        # print(f"text_dict: {text_dict}")
+        return text_dict
\ No newline at end of file
diff --git a/takway/stt/punctuation_utils.py b/takway/stt/punctuation_utils.py
new file mode 100644
index 0000000..9e038e0
--- /dev/null
+++ b/takway/stt/punctuation_utils.py
@@ -0,0 +1,119 @@
+from funasr import AutoModel
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+PUNCTUATION_MARK = [",", ".", "?", "!", ",", "。", "?", "!"]
+"""
+FUNASR
+    Model size: 1G
+    Quality: relatively good
+    Input type: only accepts a single string, not a list; a list is treated as independent strings
+"""
+FUNASR = {
+    "model_type": "funasr",
+    "model_path": "ct-punc",
+    "model_revision": "v2.0.4"
+}
+"""
+CTTRANSFORMER
+    Model size: 275M
+    Quality: relatively poor
+    Input type: accepts both a string and a list, and also supports passing in a cache
+"""
+CTTRANSFORMER = {
+    "model_type": "ct-transformer",
+    "model_path": "iic/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727",
+    "model_revision": "v2.0.4"
+}
+
+class Punctuation:
+    def __init__(self,
+                 model_type="funasr",  # funasr | ct-transformer
+                 model_path="ct-punc",
+                 device="cuda",
+                 model_revision="v2.0.4",
+                 **kwargs):
+
+        self.model_type = model_type
+        self.initialize(model_type, model_path, device, model_revision, **kwargs)
+
+    def initialize(self,
+                   model_type,
+                   model_path,
+                   device,
+                   model_revision,
+                   **kwargs):
+        if model_type == 'funasr':
+            self.punc_model = AutoModel(model=model_path, device=device, model_revision=model_revision, **kwargs)
+        elif model_type == 'ct-transformer':
+            self.punc_model = pipeline(task=Tasks.punctuation, model=model_path, model_revision=model_revision, **kwargs)
+        else:
+            raise NotImplementedError(f"unsupported model type [{model_type}]. only [funasr|ct-transformer] expected.")
+
+    def check_text_type(self,
+                        text_data):
+        # funasr only accepts a single str, not a list; join a list into one string here
+        if self.model_type == 'funasr':
+            if isinstance(text_data, str):
+                pass
+            elif isinstance(text_data, list):
+                text_data = ''.join(text_data)
+            else:
+                raise TypeError(f"text must be str or list, but got {type(text_data)}")
+        # ct-transformer accepts list input
+        # TODO: check whether splitting the string improves efficiency
+        elif self.model_type == 'ct-transformer':
+            if isinstance(text_data, str):
+                text_data = [text_data]
+            elif isinstance(text_data, list):
+                pass
+            else:
+                raise TypeError(f"text must be str or list, but got {type(text_data)}")
+        else:
+            pass
+        return text_data
+
+    def generate_cache(self, cache):
+        new_cache = {'pre_text': ""}
+        for text in cache['text']:
+            if text != '':
+                new_cache['pre_text'] = new_cache['pre_text']+text
+        return new_cache
+
+    def process(self,
+                text,
+                append_period=False,
+                cache={}):
+        if text == '':
+            return ''
+        text = self.check_text_type(text)
+        if self.model_type == 'funasr':
+            result = self.punc_model.generate(text)
+        elif self.model_type == 'ct-transformer':
+            if cache != {}:
+                cache = self.generate_cache(cache)
+            result = self.punc_model(text, cache=cache)
+        punced_text = ''
+        for res in result:
+            punced_text += res['text']
+        # if the text does not end with a punctuation mark, append one manually
+        if append_period and not punced_text[-1] in PUNCTUATION_MARK:
+            punced_text += "。"
+        return punced_text
+
+if __name__ == "__main__":
+    inputs = "把字符串拆分为list只|适用于ct-transformer模型|在数据处理部分|已经把list转为单个字符串"
+    """
+    Splitting the string into a list only applies to the ct-transformer model;
+    in the data-processing step the list has already been joined into a single string for funasr.
+    """
+    vads = inputs.split("|")
+    device = "cuda"
+    CTTRANSFORMER.update({"device": device})
+    puct_model = Punctuation(**CTTRANSFORMER)
+    result = puct_model.process(vads)
+    print(result)
+    # FUNASR.update({"device":"cuda"})
+    # puct_model = Punctuation(**FUNASR)
+    # result = puct_model.process(vads)
+    # print(result)
\ No newline at end of file
diff --git a/takway/stt/speaker_ver_utils.py b/takway/stt/speaker_ver_utils.py
new file mode 100644
index 0000000..838393f
--- /dev/null
+++ b/takway/stt/speaker_ver_utils.py
@@ -0,0 +1,86 @@
+from modelscope.pipelines import pipeline
+import numpy as np
+import os
+import pdb
+ERES2NETV2 = {
+    "task": 'speaker-verification',
+    "model_name": 'damo/speech_eres2netv2_sv_zh-cn_16k-common',
+    "model_revision": 'v1.0.1',
+    "save_embeddings": False
+}
+
+# default path for saving speaker embeddings
+DEFALUT_SAVE_PATH = r"D:\python\irving\takway_base-main\examples"
+
+class speaker_verfication:
+    def __init__(self,
+                 task='speaker-verification',
+                 model_name='damo/speech_eres2netv2_sv_zh-cn_16k-common',
+                 model_revision='v1.0.1',
+                 device="cuda",
+                 save_embeddings=False):
+        self.pipeline = pipeline(
+            task=task,
+            model=model_name,
+            model_revision=model_revision,
+            device=device)
+        self.save_embeddings = save_embeddings
+
+    def wav2embeddings(self, speaker_1_wav, save_path=None):
+        result = self.pipeline([speaker_1_wav], output_emb=True)
+        speaker_1_emb = result['embs'][0]
+        if save_path is not None:
+            np.save(save_path, speaker_1_emb)
+        return speaker_1_emb
+
+    def _verifaction(self, speaker_1_wav, speaker_2_wav, threshold, save_path):
+        if not self.save_embeddings:
+            result = self.pipeline([speaker_1_wav, speaker_2_wav], thr=threshold)
+            return result["text"]
+        else:
+            result = self.pipeline([speaker_1_wav, speaker_2_wav], thr=threshold, output_emb=True)
+            speaker1_emb = result["embs"][0]
+            speaker2_emb = result["embs"][1]
+            np.save(os.path.join(save_path, "speaker_1.npy"), speaker1_emb)
+            return result['outputs']["text"]
+
+    def _verifaction_from_embedding(self, base_emb, speaker_2_wav, threshold):
+        base_emb = np.load(base_emb)
+        result = self.pipeline([speaker_2_wav], output_emb=True)
+        speaker2_emb = result["embs"][0]
+        similarity = np.dot(base_emb, speaker2_emb) / (np.linalg.norm(base_emb) * np.linalg.norm(speaker2_emb))
+        if similarity > threshold:
+            return "yes"
+        else:
+            return "no"
+
+    def verfication(self,
+                    base_emb=None,
+                    speaker_1_wav=None,
+                    speaker_2_wav=None,
+                    threshold=0.333,
+                    save_path=None):
+        if base_emb is not None and speaker_1_wav is not None:
+            raise ValueError("Only need one of them, base_emb or speaker_1_wav")
+        if base_emb is not None and speaker_2_wav is not None:
+            return self._verifaction_from_embedding(base_emb, speaker_2_wav, threshold)
+        elif speaker_1_wav is not None and speaker_2_wav is not None:
+            return self._verifaction(speaker_1_wav, speaker_2_wav, threshold, save_path)
+        else:
+            raise NotImplementedError
+
+if __name__ == '__main__':
+    verifier = speaker_verfication(**ERES2NETV2)
+
+    verifier = speaker_verfication(save_embeddings=False)
+    result = verifier.verfication(base_emb=None, speaker_1_wav=r"C:\Users\bing\Downloads\speaker1_a_cn_16k.wav",
+                                  speaker_2_wav=r"C:\Users\bing\Downloads\speaker2_a_cn_16k.wav",
+                                  threshold=0.333,
+                                  save_path=r"D:\python\irving\takway_base-main\savePath"
+                                  )
+    print("---")
+    print(result)
+    print(verifier.verfication(r"D:\python\irving\takway_base-main\savePath\speaker_1.npy",
+                               speaker_2_wav=r"C:\Users\bing\Downloads\speaker1_b_cn_16k.wav",
+                               threshold=0.333,
+                               ))
\ No newline at end of file
diff --git a/takway/stt/vosk_utils.py b/takway/stt/vosk_utils.py
new file mode 100644
index 0000000..b67cfa5
--- /dev/null
+++ b/takway/stt/vosk_utils.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ####################################################### #
+# VOSKAutoSpeechRecognizer
+# ####################################################### #
+import json
+import wave
+import io
+import os
+from vosk import Model, KaldiRecognizer, SetLogLevel
+from .base_stt import STTBase
+from ..common_utils import decode_str2bytes
+
+class VOSKAutoSpeechRecognizer(STTBase):
+    def __init__(self, model_path="vosk-model-small-cn-0.22", RATE=16000, cfg_path=None, efficent_mode=True, debug=False):
+        super().__init__(model_path=model_path, RATE=RATE, cfg_path=cfg_path, debug=debug)
+        self.asr = KaldiRecognizer(Model(model_path=model_path), RATE)  # VOSK recognizer used by the methods below
+
+        self.apply_asr_config(self.asr_cfg)
+
+    def recognize_keywords(self, audio_data, partial_size=None, queue=None):
+        """recognize keywords in audio data"""
+        audio_data = self.check_audio_type(audio_data)
+        if partial_size is None:
+            rec_result = self.recognize(audio_data, queue)
+            rec_text = self.result_postprecess(rec_result)
+        else:
+            rec_result = self.partial_recognize(audio_data, partial_size, queue)
+            rec_text = self.result_postprecess(rec_result, 'partial')
+            print(f"rec_text: {rec_text}")
+        if rec_text != '':
+            print(f"rec_text: {rec_text}")
+        if any(keyword in rec_text for keyword in self.keywords):
+            print("Keyword detected.")
+            return True, rec_text
+        else:
+            return False, None
+
+    def recognize(self, audio_data, queue=None):
+        """recognize audio data to text"""
+        audio_data = self.check_audio_type(audio_data)
+        self.asr.AcceptWaveform(audio_data)
+        result = json.loads(self.asr.FinalResult())
+        # TODO: put result to queue
+        return result
+
+    def partial_recognize(self, audio_data, partial_size=1024, queue=None):
+        """recognize partial result"""
+        audio_data = self.check_audio_type(audio_data)
+        text_dict = dict(
+            text=[],
+            partial=[],
+            final=[],
+            is_end=False)
+        # split the audio data into chunks and recognize them one by one
+        for i in range(0, len(audio_data), partial_size):
+            # print(f"partial data: {i} - {i+partial_size}")
+            data = audio_data[i:i+partial_size]
+            if len(data) == 0:
+                break
+            if self.asr.AcceptWaveform(data):
+                result = json.loads(self.asr.Result())
+                if result['text'] != '':
+                    text_dict['text'].append(result['text'])
+                    if queue is not None:
+                        queue.put(('stt_info', text_dict))
+                    # print(f"text result: {result}")
+            else:
+                result = json.loads(self.asr.PartialResult())
+                if result['partial'] != '':
+                    # text_dict['partial'].append(result['partial'])
+                    text_dict['partial'] = [result['partial']]
+                    if queue is not None:
+                        queue.put(('stt_info', text_dict))
+                    # print(f"partial result: {result}")
+
+        # final recognize
+        final_result = json.loads(self.asr.FinalResult())
+        if final_result['text'] != '':
+            text_dict['final'].append(final_result['text'])
+            text_dict['text'].append(final_result['text'])
+
+        text_dict['is_end'] = True
+
+        print(f"final dict: {text_dict}")
+        if queue is not None:
+            queue.put(('stt_info', text_dict))
+        return text_dict
+
+
+if __name__ == "__main__":
+    '''
+    wav_file_path = "recording.wav"
+
+    # You can set log level to -1 to disable debug messages
+    SetLogLevel(0)
+
+    model = Model(model_path="vosk-model-small-cn-0.22")
+
+    # record audio
+    # record_audio(wav_file_path)
+    data = record_audio()
+
+    # transcribe the recorded audio
+    result = audio_to_text(data, model)
+
+    print("-------------")
+    print(result)
+    '''
+    from takway.audio_utils import Recorder
+    rec = Recorder()
+
+    return_type = 'bytes'
+    data = rec.record(return_type)
+    print(type(data))
+
+    asr = VOSKAutoSpeechRecognizer()
+    # asr.recognize(data)
+    asr.add_keyword("你好")
+    asr.recognize_keywords(data)
\ No newline at end of file
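
Notes (not part of the patch): a minimal usage sketch of the ModifiedRecognizer introduced above, following the same loading pattern as the __main__ block in funasr_utils.py. The wav paths are hypothetical, and it assumes the FunASR/ModelScope models referenced in this patch can be resolved locally.

    # sketch only: assumes the takway package from this patch is importable
    from takway.audio_utils import BaseAudio              # loader already used in funasr_utils.py's demo
    from takway.stt.modified_funasr import ModifiedRecognizer

    rec = BaseAudio(input=True, CHUNK=3840)
    data = rec.load_audio_file('my_recording.wav')        # hypothetical recording

    asr = ModifiedRecognizer(use_punct=True, use_emotion=False, use_speaker_ver=True)
    asr.initialize_speaker('speaker1_a_cn_16k.wav')       # hypothetical enrollment sample (.wav or .npy)

    # returns punctuated text, or "Other People" when speaker verification rejects the audio
    print(asr.recognize(data))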