import string
from collections import OrderedDict
from typing import Dict, List, Optional, Union

PRESERVE_ORDER_KEY = "preserve_order"
EOS = "<EOS>"


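# Grammar implemented by TokenParser (summarized from the method docstrings below):
#   A -> space F space F ... space                            (parse)
#   F -> no_space KG no_space                                 (parse_token)
#   G -> no_space :"VALUE" no_space | no_space {A} no_space   (parse_token_value)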
class TokenParser:
    """
    Parses tokenized/classified text, e.g. 'tokens { money { integer: "20" currency: "$" } } tokens { name: "left" }'

    Args:
        text: tokenized text
    """

    def __call__(self, text):
        """
        Setup function

        Args:
            text: text to be parsed
        """
        self.text = text
        self.len_text = len(text)
        self.char = text[0]  # cannot handle empty string
        self.index = 0

    def parse(self) -> List[dict]:
        """
        Main function. Implements grammar:
        A -> space F space F space F ... space

        Returns list of dictionaries
        """
        l = list()
        while self.parse_ws():
            token = self.parse_token()
            if not token:
                break
            l.append(token)
        return l

    def parse_token(self) -> Optional[Dict[str, Union[str, dict]]]:
        """
        Implements grammar:
        F -> no_space KG no_space

        Returns: K, G as dictionary values
        """
        d = OrderedDict()
        key = self.parse_string_key()
        if key is None:
            return None
        self.parse_ws()
        if key == PRESERVE_ORDER_KEY:
            self.parse_char(":")
            self.parse_ws()
            value = self.parse_chars("true")
        else:
            value = self.parse_token_value()

        d[key] = value
        return d

    def parse_token_value(self) -> Union[str, dict]:
        """
        Implements grammar:
        G -> no_space :"VALUE" no_space | no_space {A} no_space

        Returns: string or dictionary
        """
        if self.char == ":":
            self.parse_char(":")
            self.parse_ws()
            self.parse_char('"')
            value_string = self.parse_string_value()
            self.parse_char('"')
            return value_string
        elif self.char == "{":
            d = OrderedDict()
            self.parse_char("{")
            list_token_dicts = self.parse()
            # flatten the parsed sub-tokens into a single dictionary
            for tok_dict in list_token_dicts:
                for k, v in tok_dict.items():
                    d[k] = v
            self.parse_char("}")
            return d
        else:
            raise ValueError(f"Unexpected character {self.char!r} while parsing token value")

    def parse_char(self, exp) -> bool:
        """
        Parses character

        Args:
            exp: character to read in

        Returns true if successful
        """
        assert self.char == exp
        self.read()
        return True

    def parse_chars(self, exp) -> bool:
        """
        Parses characters

        Args:
            exp: characters to read in

        Returns true if successful
        """
        ok = False
        for x in exp:
            ok |= self.parse_char(x)
        return ok

    def parse_string_key(self) -> Optional[str]:
        """
        Parses string key, which can only contain ASCII letters and '_' characters

        Returns parsed string key, or None if no key characters were found
        """
        assert self.char not in string.whitespace and self.char != EOS
        incl_criterium = string.ascii_letters + "_"
        l = []
        while self.char in incl_criterium:
            l.append(self.char)
            if not self.read():
                raise ValueError("Unexpected end of text while parsing string key")

        if not l:
            return None
        return "".join(l)

    def parse_string_value(self) -> Optional[str]:
        """
        Parses string value, which ends with a quote followed by a space

        Returns parsed string value, or None if the value is empty
        """
        # assert self.char not in string.whitespace and self.char != EOS
        assert self.char != EOS
        l = []
        while self.char != '"' or self.text[self.index + 1] != " ":
            l.append(self.char)
            if not self.read():
                raise ValueError("Unexpected end of text while parsing string value")

        if not l:
            return None
        return "".join(l)

    def parse_ws(self) -> bool:
        """
        Skips whitespace.

        Returns true if not EOS after parsing
        """
        not_eos = self.char != EOS
        while not_eos and self.char == " ":
            not_eos = self.read()
        return not_eos

    def read(self) -> bool:
        """
        Reads in the next char.

        Returns true if not EOS
        """
        if self.index < self.len_text - 1:  # should be unique
            self.index += 1
            self.char = self.text[self.index]
            return True
        self.char = EOS
        return False
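

if __name__ == "__main__":
    # Minimal usage sketch (added for illustration, not part of the original module):
    # parse the example string from the TokenParser docstring. Each top-level
    # `tokens { ... }` group yields one (possibly nested) OrderedDict, e.g. the first
    # result maps 'tokens' -> {'money': {'integer': '20', 'currency': '$'}}.
    parser = TokenParser()
    parser('tokens { money { integer: "20" currency: "$" } } tokens { name: "left" }')
    for token in parser.parse():
        print(token)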