76 lines
2.0 KiB
Python
76 lines
2.0 KiB
Python
|
import re
|
||
|
import unicodedata
|
||
|
|
||
|
import regex
|
||
|
|
||
|
# Non-ASCII letters that "NFKD" normalization does not split into a base letter
# plus combining marks, mapped by hand to an ASCII replacement.
# Each replacement keeps the case of the letter it stands for.
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "TH",  # capital thorn: uppercase replacement, like every other capital here
    "ł": "l",
    "Ł": "L",
}
|
||
|
|
||
|
|
||
|
def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Replace any other markers, symbols, and punctuations with a space,
    and drop any diacritics (category 'Mn' and some manual mappings)

    The input is first normalized to NFKD, then each resulting character is
    handled by the first rule that matches:

    1. it is contained in `keep`: emitted unchanged;
    2. it is a key of ADDITIONAL_DIACRITICS: replaced by the mapped string;
    3. its Unicode category is "Mn": dropped;
    4. its Unicode category starts with "M", "S" or "P": replaced by a space;
    5. otherwise: emitted unchanged.

    `keep` is only used for membership tests (`c in keep`), and it is compared
    against the characters produced by the NFKD normalization, not against
    the characters of the raw input.
    """
    pieces = []
    for c in unicodedata.normalize("NFKD", s):
        if c in keep:
            pieces.append(c)
        elif c in ADDITIONAL_DIACRITICS:
            pieces.append(ADDITIONAL_DIACRITICS[c])
        else:
            # Look the category up once; both remaining rules depend on it.
            category = unicodedata.category(c)
            if category == "Mn":
                continue  # diacritic split off by NFKD: drop it entirely
            pieces.append(" " if category[0] in "MSP" else c)
    return "".join(pieces)
|
||
|
|
||
|
|
||
|
def remove_symbols(s: str):
    """
    Replace any other markers, symbols, punctuations with a space, keeping diacritics

    The input is normalized to NFKC first, so a base letter followed by a
    combining mark is kept as the single precomposed letter where one exists.
    Afterwards every character whose Unicode category starts with "M", "S"
    or "P" is replaced by a space; this includes combining marks that NFKC
    left standing on their own. All other characters are emitted unchanged.
    """
    normalized = unicodedata.normalize("NFKC", s)
    return "".join(
        " " if unicodedata.category(c)[0] in "MSP" else c for c in normalized
    )
|
||
|
|
||
|
|
||
|
class BasicTextNormalizer:
    """
    Callable that lowercases text, removes bracketed/parenthesized spans and
    replaces symbols and punctuation with spaces.

    remove_diacritics: when true, cleaning is done with
        remove_symbols_and_diacritics; otherwise with remove_symbols.
    split_letters: when true, the cleaned text is split into the matches of
        the `regex` pattern r"\X" and those are joined with single spaces.
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False) -> None:
        # Choose the cleaning function once so that __call__ stays branch-free.
        self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str) -> str:
        """
        Return the normalized form of `s`.

        Runs of whitespace are collapsed to a single space at the end; the
        result is not stripped, so a leading or trailing space may remain.
        """
        s = s.lower()
        # The opening and closing delimiters are matched independently, so a
        # span such as "<x]" is removed as well.
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        # Needs at least one character inside, so an empty "()" is left for
        # self.clean to handle.
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        # Lowercase again: the cleaning step may emit replacement strings.
        s = self.clean(s).lower()

        if self.split_letters:
            s = " ".join(regex.findall(r"\X", s, regex.U))

        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space

        return s
|