FunASR/fun_text_processing/text_normalization/ru/alphabet.py

51 lines
1.4 KiB
Python
Raw Normal View History

2024-05-18 15:50:56 +08:00
# Copyright 2017 Google Inc.
# Adapted from https://github.com/google/TextNormalizationCoveringGrammars
# Russian minimally supervised number grammar.
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NON_BREAKING_SPACE,
DAMO_SPACE,
)
from fun_text_processing.text_normalization.ru.utils import get_abs_path
RU_LOWER_ALPHA = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
RU_UPPER_ALPHA = RU_LOWER_ALPHA.upper()
RU_LOWER_ALPHA = pynini.union(*RU_LOWER_ALPHA).optimize()
RU_UPPER_ALPHA = pynini.union(*RU_UPPER_ALPHA).optimize()
RU_ALPHA = (RU_LOWER_ALPHA | RU_UPPER_ALPHA).optimize()
RU_STRESSED_MAP = [
("А́", "А'"),
("Е́", "Е'"),
("Ё́", "Е'"),
("И́", "И'"),
("О́", "О'"),
("У́", "У'"),
("Ы́", "Ы'"),
("Э́", "Э'"),
("Ю́", "Ю'"),
("Я́", "Я'"),
("а́", "а'"),
("е́", "е'"),
("ё́", "е'"),
("и́", "и'"),
("о́", "о'"),
("у́", "у'"),
("ы́", "ы'"),
("э́", "э'"),
("ю́", "ю'"),
("я́", "я'"),
("ё", "е"),
("Ё", "Е"),
]
REWRITE_STRESSED = pynini.closure(
pynini.string_map(RU_STRESSED_MAP).optimize() | RU_ALPHA
).optimize()
TO_CYRILLIC = pynini.string_file(get_abs_path("data/latin_to_cyrillic.tsv")).optimize()
TO_LATIN = pynini.invert(TO_CYRILLIC).optimize()
RU_ALPHA_OR_SPACE = pynini.union(RU_ALPHA, DAMO_SPACE, DAMO_NON_BREAKING_SPACE).optimize()