180 lines
4.4 KiB
Python
180 lines
4.4 KiB
Python
import string
from collections import OrderedDict
from typing import Dict, List, Optional, Union
|
|
|
|
PRESERVE_ORDER_KEY = "preserve_order"
# Sentinel stored in ``TokenParser.char`` once the whole input has been consumed.
EOS = "<EOS>"


class TokenParseError(ValueError, AssertionError):
    """
    Raised when the input does not follow the token grammar.

    Derives from both ``ValueError`` and ``AssertionError`` because the parser
    historically signalled malformed input with either of the two; callers
    catching one or the other keep working.
    """


class TokenParser:
    """
    Parses tokenized/classified text, e.g.
    'tokens { money { integer: "20" currency: "$" } } tokens { name: "left" }'

    The closing quote of a string value must be followed by a space or by the
    end of the input; this is how quotes inside a value are told apart from
    the terminating one.

    Usage: call the instance with the text to prime it, then call ``parse()``.
    """

    # Characters allowed in a key.
    _KEY_CHARS = frozenset(string.ascii_letters + "_")

    def __call__(self, text):
        """
        Setup function. Resets the parser state to the start of ``text``.

        Args:
            text: text to be parsed; an empty string is treated as immediate end of input
        """
        self.text = text
        self.len_text = len(text)
        # Start at EOS for empty input so that parse() simply returns no tokens.
        self.char = text[0] if text else EOS
        self.index = 0

    def parse(self) -> List[dict]:
        """
        Main function. Implements grammar:
        A -> space F space F space F ... space

        Stops at end of input or at the first position where no key can be read
        (e.g. the '}' closing a nested block).

        Returns list of dictionaries
        """
        tokens = []
        while self.parse_ws():
            token = self.parse_token()
            if not token:
                break
            tokens.append(token)
        return tokens

    def parse_token(self) -> Optional[Dict[str, Optional[Union[str, bool, dict]]]]:
        """
        Implements grammar:
        F-> no_space KG no_space

        Returns: single-entry dictionary mapping K to G, or None if no key starts
            at the current position. The value of the ``preserve_order`` key is
            the boolean True.
        """
        key = self.parse_string_key()
        if key is None:
            return None
        self.parse_ws()

        if key == PRESERVE_ORDER_KEY:
            # Special form without quotes: preserve_order: true
            self.parse_char(":")
            self.parse_ws()
            value = self.parse_chars("true")
        else:
            value = self.parse_token_value()

        token = OrderedDict()
        token[key] = value
        return token

    def parse_token_value(self) -> Optional[Union[str, dict]]:
        """
        Implements grammar:
        G-> no_space :"VALUE" no_space | no_space {A} no_space

        Returns: string (None for an empty string value) or dictionary

        Raises:
            TokenParseError: if the current character starts neither form, or
                the expected delimiters are missing
        """
        if self.char == ":":
            self.parse_char(":")
            self.parse_ws()
            self.parse_char('"')
            value_string = self.parse_string_value()
            self.parse_char('"')
            return value_string

        if self.char == "{":
            self.parse_char("{")
            # Flatten the nested single-entry token dictionaries into one mapping;
            # a repeated key keeps the last value.
            flattened = OrderedDict()
            for tok_dict in self.parse():
                flattened.update(tok_dict)
            self.parse_char("}")
            return flattened

        raise TokenParseError(
            "expected ':' or '{{' at index {}, found {!r}".format(self.index, self.char)
        )

    def parse_char(self, exp) -> bool:
        """
        Parses character

        Args:
            exp: character to read in

        Returns true if successful

        Raises:
            TokenParseError: if the current character is not ``exp``
        """
        # Explicit raise instead of ``assert`` so the check survives ``python -O``.
        if self.char != exp:
            raise TokenParseError(
                "expected {!r} at index {}, found {!r}".format(exp, self.index, self.char)
            )
        self.read()
        return True

    def parse_chars(self, exp) -> bool:
        """
        Parses characters

        Args:
            exp: characters to read in

        Returns true if at least one character was read (false for empty ``exp``)

        Raises:
            TokenParseError: if the input does not match ``exp``
        """
        ok = False
        for expected in exp:
            ok |= self.parse_char(expected)
        return ok

    def parse_string_key(self) -> Optional[str]:
        """
        Parses string key, can only contain ascii letters and '_' characters

        Returns parsed string key, or None if no key character is found at the
        current position

        Raises:
            TokenParseError: if positioned at whitespace or end of input, or if
                the input ends while the key is still being read
        """
        if self.char == EOS or self.char in string.whitespace:
            raise TokenParseError(
                "expected a key at index {}, found {!r}".format(self.index, self.char)
            )

        key_chars = []
        while self.char in self._KEY_CHARS:
            key_chars.append(self.char)
            if not self.read():
                # A key must be followed by a value, so input cannot end here.
                raise TokenParseError("unexpected end of input after key")

        if not key_chars:
            return None
        return "".join(key_chars)

    def parse_string_value(self) -> Optional[str]:
        """
        Parses string value, ends with quote followed by space or by the end of
        the input. The closing quote itself is not consumed.

        Returns parsed string value, or None if the value is empty

        Raises:
            TokenParseError: if the input ends before the closing quote
        """
        if self.char == EOS:
            raise TokenParseError("unexpected end of input inside a string value")

        value_chars = []
        while not self._at_closing_quote():
            value_chars.append(self.char)
            if not self.read():
                raise TokenParseError("unterminated string value")

        if not value_chars:
            return None
        return "".join(value_chars)

    def _at_closing_quote(self) -> bool:
        """Returns true if the current char is a quote followed by a space or by end of input."""
        if self.char != '"':
            return False
        next_index = self.index + 1
        # Bounds check first: a quote that is the very last character closes the value.
        return next_index >= self.len_text or self.text[next_index] == " "

    def parse_ws(self):
        """
        Skips over space characters.

        Returns true if not EOS after parsing
        """
        not_eos = self.char != EOS
        while not_eos and self.char == " ":
            not_eos = self.read()
        return not_eos

    def read(self):
        """
        Reads in next char. At the end of the text the current char becomes EOS
        and the index stays on the last position.

        Returns true if not EOS
        """
        if self.index < self.len_text - 1:
            self.index += 1
            self.char = self.text[self.index]
            return True
        self.char = EOS
        return False
|