FunASR/fun_text_processing/text_normalization/en/verbalizers/fraction.py

95 lines
3.2 KiB
Python
Raw Normal View History

2024-05-18 15:50:56 +08:00
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
DAMO_SIGMA,
GraphFst,
insert_space,
)
from fun_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst
from pynini.examples import plurals
from pynini.lib import pynutil
class FractionFst(GraphFst):
    """
    Finite state transducer for verbalizing fractions, e.g.
        tokens { fraction { integer_part: "twenty three" numerator: "four" denominator: "five" } } ->
        twenty three and four fifths
    (the example assumes the ordinal suffix grammar turns "five" into "fifth").

    Readings hard-wired into the grammar:
        - denominator "two" is read as "half" ("halves" when the numerator is not "one"),
        - denominator "four" is read as "quarter",
        - denominator "one" is read as "over one",
        - a trailing "and one half" is rewritten to "and a half".

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
        lm: only consulted when deterministic is False; if lm is False as well,
            the "and" inserted after the integer part becomes optional
    """

    def __init__(self, deterministic: bool = True, lm: bool = False):
        super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)
        ordinal_suffix = OrdinalFst().suffix

        # Whole-number part: drop the field wrapper, keep the spoken words.
        whole = (
            pynutil.delete('integer_part: "')
            + pynini.closure(DAMO_NOT_QUOTE)
            + pynutil.delete('" ')
        )

        # Denominator readings. The three special cases take priority over the
        # generic reading that pipes the denominator through the ordinal suffix.
        one_reading = pynini.cross('denominator: "one"', "over one")
        half_reading = pynini.cross('denominator: "two"', "half")
        quarter_reading = pynini.cross('denominator: "four"', "quarter")
        ordinal_reading = (
            pynutil.delete('denominator: "')
            + (pynini.closure(DAMO_NOT_QUOTE) @ ordinal_suffix)
            + pynutil.delete('"')
        )

        # Fold from the lowest priority outwards so the resulting nesting is
        # one > half > quarter > generic ordinal.
        denominators = ordinal_reading
        for special in (quarter_reading, half_reading, one_reading):
            denominators = plurals._priority_union(special, denominators, DAMO_SIGMA)
        denominators = denominators.optimize()

        if not deterministic:
            # Additionally allow "four" to go through the ordinal suffix
            # alongside the "quarter" reading.
            four_as_ordinal = (
                pynutil.delete('denominator: "')
                + (pynini.accep("four") @ ordinal_suffix)
                + pynutil.delete('"')
            )
            denominators = denominators | four_as_ordinal

        # Numerator "one": the denominator stays singular.
        singular = (
            pynutil.delete('numerator: "')
            + pynini.accep("one")
            + pynutil.delete('" ')
            + insert_space
            + denominators
        )

        # Any other numerator: the denominator is pluralised at end of string,
        # with "half" -> "halves" taking priority over appending "s".
        not_one = pynini.closure(DAMO_NOT_QUOTE) - pynini.accep("one")
        pluralize = pynini.cdrewrite(
            plurals._priority_union(
                pynini.cross("half", "halves"), pynutil.insert("s"), DAMO_SIGMA
            ),
            "",
            "[EOS]",
            DAMO_SIGMA,
        )
        plural = (
            pynutil.delete('numerator: "')
            + not_one
            + pynutil.delete('" ')
            + insert_space
            + denominators
        ) @ pluralize

        fraction = singular | plural

        # "and " joins the integer part to the fraction.
        joiner = pynutil.insert("and ")
        if not (deterministic or lm):
            joiner = pynini.closure(joiner, 0, 1)
        optional_whole = pynini.closure(whole + insert_space + joiner, 0, 1)

        # Final clean-up at end of string: "and one half" -> "and a half", and
        # undo the plural "s" that the pluralisation step put on "over one".
        tidy = pynini.cdrewrite(
            pynini.cross("and one half", "and a half") | pynini.cross("over ones", "over one"),
            "",
            "[EOS]",
            DAMO_SIGMA,
        )

        self.graph = (optional_whole + fraction) @ tidy
        self.fst = self.delete_tokens(self.graph).optimize()