# Source code for lmp.tknzr._char

"""Character tokenizer class.

Attributes
----------
SP_TKS_PTTN: typing.Final[re.Pattern]
  Special tokens matching pattern.
"""

import re
from typing import ClassVar, Final, List

from lmp.tknzr._base import BaseTknzr
from lmp.vars import SP_TKS

# One capture group holding an alternation of every special token.  Each token
# is escaped so that regex metacharacters in it are matched literally.
SP_TKS_PTTN: Final[re.Pattern] = re.compile(
  '(' + '|'.join(re.escape(sp_tk) for sp_tk in SP_TKS) + ')'
)


class CharTknzr(BaseTknzr):
  """Character tokenizer class.

  Tokenize text into list of unicode characters.

  Parameters
  ----------
  is_uncased: bool, default: False
    Set to ``True`` to convert text into lowercase.
    Mainly used by :py:meth:`~norm`.
  max_vocab: int, default: -1
    Tokenizer's maximum vocabulary size.
    Set to ``-1`` to include as many unicode characters in vocabulary as
    possible.
    Mainly used by :py:meth:`~build_vocab`.
  min_count: int, default: 0
    Minimum character occurrence counts.
    Unicode characters whose occurrence counts are less than ``min_count``
    will not be added to tokenizer's vocabulary.
    Mainly used by :py:meth:`~build_vocab`.
  kwargs: typing.Any, optional
    Unused parameter.
    Intentionally left for subclass inheritance.

  Attributes
  ----------
  id2tk: dict[int, str]
    Id-to-character lookup table (inverse of ``tk2id``).
  is_uncased: bool
    Convert text into lowercase if set to ``True``.
  max_vocab: int
    Tokenizer's maximum vocabulary size.
  min_count: int
    Minimum character occurrence counts.
  tk2id: dict[str, int]
    Character-to-id lookup table.
  tknzr_name: typing.ClassVar[str]
    CLI name of character tokenizer is ``character``.

  See Also
  --------
  :doc:`lmp.tknzr </tknzr/index>`
    All available tokenizers.

  Examples
  --------
  >>> from lmp.tknzr import CharTknzr
  >>> tknzr = CharTknzr()
  >>> assert tknzr.tknz('abc') == ['a', 'b', 'c']
  >>> assert tknzr.dtknz(['a', 'b', 'c']) == 'abc'
  """

  tknzr_name: ClassVar[str] = 'character'

  def tknz(self, txt: str) -> List[str]:
    """Convert text into character list.

    Text is first normalized and then split into a list of unicode
    characters.
    Each special token is treated as a single unicode character and thus is
    not split.

    Parameters
    ----------
    txt: str
      Text to be tokenized.

    Returns
    -------
    list[str]
      List of normalized unicode characters.

    See Also
    --------
    ~dtknz
      Convert unicode character list back to text.
    ~norm
      Text normalization.

    Examples
    --------
    >>> from lmp.tknzr import CharTknzr
    >>> tknzr = CharTknzr()
    >>> assert tknzr.tknz('abc') == ['a', 'b', 'c']
    >>> assert tknzr.tknz('abc def') == ['a', 'b', 'c', ' ', 'd', 'e', 'f']
    """
    # Perform normalization.
    txt = self.norm(txt)

    # Perform tokenization.
    # Scan with a moving index instead of re-slicing ``txt`` on every step, so
    # the remaining text is not copied once per emitted token.
    tks = []
    pos = 0
    end = len(txt)
    while pos < end:
      match = SP_TKS_PTTN.match(txt, pos)
      # Accept only matches that consume text; a zero-width match would never
      # advance ``pos`` and the loop would not terminate.
      if match and match.end() > pos:
        # A special token starts here: keep it whole.
        tks.append(match.group(1))
        pos = match.end()
      else:
        # Ordinary text: emit exactly one character.
        tks.append(txt[pos])
        pos += 1

    return tks

  def dtknz(self, tks: List[str]) -> str:
    """Convert unicode character list back to text.

    Unicode character list is joined without whitespaces.
    Returned text is normalized.

    Parameters
    ----------
    tks: list[str]
      Unicode character list to be joined.

    Returns
    -------
    str
      Normalized text without additional whitespaces other than the ones in
      the unicode character list.

    See Also
    --------
    ~tknz
      Convert text into unicode characters.
    ~norm
      Text normalization.

    Examples
    --------
    >>> from lmp.tknzr import CharTknzr
    >>> tknzr = CharTknzr()
    >>> assert tknzr.dtknz(['a', 'b', 'c']) == 'abc'
    >>> assert tknzr.dtknz(['a', 'b', 'c', ' ', 'd', 'e', 'f']) == 'abc def'
    """
    # Join the characters first, then normalize the joined text.
    return self.norm(''.join(tks))