Source code for lmp.tknzr._char

"""Character tokenizer class.

Attributes
----------
SP_TKS_PTTN: typing.Final[re.Pattern]
  Special tokens matching pattern.
"""

import re
from typing import ClassVar, Final, List

from lmp.tknzr._base import BaseTknzr
from lmp.vars import SP_TKS

# One capture group holding an alternation of every special token.
# Tokens are escaped so they are matched literally rather than as regex syntax.
SP_TKS_PTTN: Final[re.Pattern] = re.compile('({})'.format('|'.join(re.escape(sp_tk) for sp_tk in SP_TKS)))


class CharTknzr(BaseTknzr):
  """Character tokenizer class.

  Tokenize text into a list of unicode characters.
  Special tokens are kept intact and treated as single characters.

  Parameters
  ----------
  is_uncased: bool, default: False
    Set to ``True`` to convert text into lowercase.
    Mainly used by :py:meth:`~norm`.
  max_vocab: int, default: -1
    Tokenizer's maximum vocabulary size.
    Set to ``-1`` to include as many unicode characters in vocabulary as possible.
    Mainly used by :py:meth:`~build_vocab`.
  min_count: int, default: 0
    Minimum character occurrence counts.
    Unicode characters whose occurrence counts are less than ``min_count`` will not be added to tokenizer's
    vocabulary.
    Mainly used by :py:meth:`~build_vocab`.
  kwargs: typing.Any, optional
    Unused parameter.
    Intentionally left for subclass inheritance.

  Attributes
  ----------
  id2tk: dict[int, str]
    Character-to-id inverse lookup table.
  is_uncased: bool
    Convert text into lowercase if set to ``True``.
  max_vocab: int
    Tokenizer's maximum vocabulary size.
  min_count: int
    Minimum character occurrence counts.
  tk2id: dict[str, int]
    Character-to-id lookup table.
  tknzr_name: typing.ClassVar[str]
    CLI name of character tokenizer is ``character``.

  See Also
  --------
  :doc:`lmp.tknzr </tknzr/index>`
    All available tokenizers.

  Examples
  --------
  >>> from lmp.tknzr import CharTknzr
  >>> tknzr = CharTknzr()
  >>> assert tknzr.tknz('abc') == ['a', 'b', 'c']
  >>> assert tknzr.dtknz(['a', 'b', 'c']) == 'abc'
  """

  tknzr_name: ClassVar[str] = 'character'

  def tknz(self, txt: str) -> List[str]:
    """Convert text into character list.

    Text is first normalized and then split into a list of unicode characters.
    Each special token is treated as a unicode character and thus is not split.

    Parameters
    ----------
    txt: str
      Text to be tokenized.

    Returns
    -------
    list[str]
      List of normalized unicode characters.

    See Also
    --------
    ~dtknz
      Convert unicode character list back to text.
    ~norm
      Text normalization.

    Examples
    --------
    >>> from lmp.tknzr import CharTknzr
    >>> tknzr = CharTknzr()
    >>> assert tknzr.tknz('abc') == ['a', 'b', 'c']
    >>> assert tknzr.tknz('abc def') == ['a', 'b', 'c', ' ', 'd', 'e', 'f']
    """
    # Perform normalization.
    txt = self.norm(txt)

    # Perform tokenization in a single left-to-right pass.
    # Walking the special token matches avoids re-slicing the remaining text after every token, which would copy
    # the tail of the string each time and make tokenization quadratic in the text length.
    tks: List[str] = []
    pos = 0
    for match in SP_TKS_PTTN.finditer(txt):
      # Everything between the previous special token and this one is split into single characters.
      tks.extend(txt[pos:match.start()])
      # The special token itself is kept as one token.
      tks.append(match.group(1))
      pos = match.end()

    # Remaining text after the last special token (the whole text when no special token is found).
    tks.extend(txt[pos:])

    return tks

  def dtknz(self, tks: List[str]) -> str:
    """Convert unicode character list back to text.

    Unicode character list is joined without whitespaces.
    Returned text is normalized.

    Parameters
    ----------
    tks: list[str]
      Unicode character list to be joined.

    Returns
    -------
    str
      Normalized text without additional whitespaces other than the ones in the unicode character list.

    See Also
    --------
    ~tknz
      Convert text into unicode characters.
    ~norm
      Text normalization.

    Examples
    --------
    >>> from lmp.tknzr import CharTknzr
    >>> tknzr = CharTknzr()
    >>> assert tknzr.dtknz(['a', 'b', 'c']) == 'abc'
    >>> assert tknzr.dtknz(['a', 'b', 'c', ' ', 'd', 'e', 'f']) == 'abc def'
    """
    # First perform detokenization, then do normalization.
    # The order of these operations does not affect the output.
    return self.norm(''.join(tks))