196 lines
7.8 KiB
Python
196 lines
7.8 KiB
Python
import pynini
|
|
from fun_text_processing.text_normalization.en.graph_utils import (
|
|
DAMO_CHAR,
|
|
DAMO_NOT_QUOTE,
|
|
DAMO_NOT_SPACE,
|
|
DAMO_SIGMA,
|
|
DAMO_SPACE,
|
|
GraphFst,
|
|
delete_space,
|
|
insert_space,
|
|
)
|
|
from fun_text_processing.text_normalization.es.graph_utils import (
|
|
accents,
|
|
shift_cardinal_gender,
|
|
strip_cardinal_apocope,
|
|
)
|
|
from pynini.lib import pynutil
|
|
|
|
|
|
class FractionFst(GraphFst):
    """
    Finite state transducer for verbalizing fraction
        e.g. tokens { fraction { integer_part: "treinta y tres" numerator: "cuatro" denominator: "quinto" morphosyntactic_features: "ordinal" } } ->
            treinta y tres y cuatro quintos

    Fields consumed from the serialized token:
        integer_part: optional; when present it is joined to the fraction with " y ".
        numerator: the literal "un" is handled separately from every other value.
        denominator: followed by morphosyntactic_features "add_root" (an " avo"
            stem is appended), "ordinal" (used as given), or by no feature at
            all (read as "<numerator> sobre <denominator>").

    Attributes set:
        graph_masc: optimized graph for the default (masculine) reading.
        graph_fem: optimized graph with a gender-shifted integer part and "media(s)".
        graph: union of the masculine and feminine graphs.
        fst: ``graph`` wrapped by ``delete_tokens`` and optimized.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)

        def spaced(head, tail):
            # Concatenate two parts with delete_space followed by insert_space between them.
            return head + delete_space + insert_space + tail

        # Inserted material. The 'avo' suffix is inserted with a leading space as a
        # processing aid; the merge rules below close that gap again.
        avo_suffix = pynutil.insert(" avo")
        plural_s = pynutil.insert("s")
        y_conjunction = pynutil.insert(" y ")

        # --- token fields -------------------------------------------------
        whole_part = (
            pynutil.delete('integer_part: "')
            + strip_cardinal_apocope(pynini.closure(DAMO_NOT_QUOTE))
            + pynutil.delete('"')
        )

        # Numerator "un" gets its own path so singular denominators can follow it.
        num_un = pynutil.delete('numerator: "') + pynini.accep("un") + pynutil.delete('" ')
        num_other = (
            pynutil.delete('numerator: "')
            + pynini.difference(pynini.closure(DAMO_NOT_QUOTE), "un")
            + pynutil.delete('" ')
        )

        den_with_stem = (
            pynutil.delete('denominator: "')
            + pynini.closure(DAMO_NOT_QUOTE)
            + avo_suffix
            + pynutil.delete('" morphosyntactic_features: "add_root"')
        )
        den_ordinal = (
            pynutil.delete('denominator: "')
            + pynini.closure(DAMO_NOT_QUOTE)
            + pynutil.delete('" morphosyntactic_features: "ordinal"')
        )
        den_cardinal = (
            pynutil.delete('denominator: "')
            + pynini.closure(DAMO_NOT_QUOTE)
            + pynutil.delete('"')
        )

        den_singular = den_with_stem | den_ordinal
        if not deterministic:
            # Occasional exceptions: alternative forms offered next to the regular ones.
            irregular_forms = pynini.string_map(
                [("once avo", "undécimo"), ("doce avo", "duodécimo")]
            )
            den_singular = den_singular | (den_with_stem @ irregular_forms)
        den_plural = den_singular + plural_s

        # --- merging the denominator into a single word ---------------------
        # The conjunction " y " inside the denominator becomes "i" ...
        y_to_i = pynini.cdrewrite(pynini.cross(" y ", "i"), "", "", DAMO_SIGMA)
        # ... and remaining spaces are dropped.
        # NOTE(review): DAMO_CHAR presumably matches single characters, in which case
        # subtracting the five-letter string "parte" removes nothing -- confirm whether
        # the intent was to keep the space in front of "parte".
        drop_spaces = pynini.cdrewrite(
            delete_space, "", pynini.difference(DAMO_CHAR, "parte"), DAMO_SIGMA
        )
        fuse_words = y_to_i @ drop_spaces

        # Merging can leave doubled vowels, which the orthography does not allow.
        collapse_vowels = pynini.cdrewrite(
            pynini.string_map([("aa", "a"), ("oo", "o")]), "", "", DAMO_SIGMA
        )

        # Apply the imported `accents` rewrite inside any word that ends in one of the
        # fractional suffixes (presumably stripping accent marks -- see graph_utils).
        strip_accents = pynini.cdrewrite(
            accents,
            pynini.union(DAMO_SPACE, pynini.accep("[BOS]")) + pynini.closure(DAMO_NOT_SPACE),
            pynini.closure(DAMO_NOT_SPACE) + pynini.union("avo", "ava", "ésimo", "ésima"),
            DAMO_SIGMA,
        )
        to_single_word = fuse_words @ strip_accents @ collapse_vowels

        # --- the three basic readings ---------------------------------------
        frac_default = spaced(num_other, den_plural @ to_single_word)
        frac_one = spaced(num_un, den_singular @ to_single_word)
        frac_cardinal = (
            strip_cardinal_apocope(num_other | num_un)
            + delete_space
            + pynutil.insert(" sobre ")
            + strip_cardinal_apocope(den_cardinal)
        )

        if not deterministic:
            # Alternative rendering where the ordinal acts as an adjective of 'parte',
            # which requires the feminine. Other rules manage a trailing "un", so only
            # the endings are handled here.
            fem_exceptions = pynini.cdrewrite(
                pynini.string_map([("tercia", "tercera")]), "", "", DAMO_SIGMA
            )
            final_o_to_a = pynini.cdrewrite(
                pynini.cross("o", "a"), "", pynini.accep("[EOS]"), DAMO_SIGMA
            )

            den_singular_fem = (
                shift_cardinal_gender(den_singular) @ final_o_to_a @ fem_exceptions
            )
            den_plural_fem = den_singular_fem + plural_s

            num_un_fem = shift_cardinal_gender(num_un)
            num_other_fem = shift_cardinal_gender(num_other)

            frac_cardinal = frac_cardinal | (
                (num_un_fem | num_other_fem)
                + delete_space
                + pynutil.insert(" sobre ")
                + shift_cardinal_gender(den_cardinal)
            )

            # Alternative spacing: only the gap in front of the stem is closed.
            close_stem_gap = pynini.cdrewrite(
                delete_space, "", pynini.union("avo", "ava", "avos", "avas"), DAMO_SIGMA
            )
            stem_only_merge = close_stem_gap @ (strip_accents @ collapse_vowels)

            # Both the stem-only and the fully merged forms exist.
            one_fem = spaced(
                num_un_fem,
                pynini.union(
                    den_singular_fem @ stem_only_merge,
                    den_singular_fem @ to_single_word,
                ),
            )
            one_fem = one_fem + pynutil.insert(" parte")
            # "media" not "una media"
            one_fem = one_fem @ pynini.cdrewrite(
                pynini.cross("una media", "media"), "", "", DAMO_SIGMA
            )

            default_fem = spaced(
                num_other_fem,
                pynini.union(
                    den_plural_fem @ stem_only_merge,
                    den_plural_fem @ to_single_word,
                ),
            )
            default_fem = default_fem + pynutil.insert(" partes")

            # Masculine forms without the full merger, plus the feminine 'parte(s)' forms.
            frac_default = (
                frac_default
                | spaced(num_other, den_plural @ stem_only_merge)
                | default_fem
            )
            frac_one = (
                frac_one
                | spaced(num_un, den_singular @ stem_only_merge)
                | one_fem
            )

        # "medio" not "un medio"
        # NOTE(review): block boundaries were reconstructed from a copy of this file
        # whose indentation was lost; confirm this rewrite belongs outside the
        # non-deterministic branch above.
        frac_one = frac_one @ pynini.cdrewrite(
            pynini.cross("un medio", "medio"), "", "", DAMO_SIGMA
        )

        # --- masculine graph ------------------------------------------------
        masc_fraction = frac_one | frac_default | frac_cardinal
        graph_masc = (
            pynini.closure(whole_part + delete_space + y_conjunction, 0, 1) + masc_fraction
        )

        # --- feminine graph -------------------------------------------------
        # Gender only shows on the integer part, except for "medio" -> "media".
        whole_part_fem = shift_cardinal_gender(whole_part)
        frac_default_with_medias = frac_default | spaced(
            shift_cardinal_gender(num_other),
            den_plural @ pynini.cross("medios", "medias"),
        )
        # The numerator "un" is consumed without output, leaving a bare "media".
        frac_one_with_media = frac_one | (
            pynutil.delete(num_un)
            + delete_space
            + (den_singular @ pynini.cross("medio", "media"))
        )

        fem_fraction = frac_one_with_media | frac_default_with_medias | frac_cardinal
        graph_fem = (
            pynini.closure(whole_part_fem + delete_space + y_conjunction, 0, 1) + fem_fraction
        )

        self.graph_masc = pynini.optimize(graph_masc)
        self.graph_fem = pynini.optimize(graph_fem)

        self.graph = graph_masc | graph_fem

        self.fst = self.delete_tokens(self.graph).optimize()
|