# Source code for lmp.tknzr._ws

"""Whitespace tokenizer class.

Attributes
----------
SPLIT_PTTN: typing.Final[re.Pattern]
  Special tokens and whitespaces matching pattern.
"""

import re
from typing import ClassVar, Final, List

from lmp.tknzr._base import BaseTknzr
from lmp.vars import SP_TKS

# Alternation of every (escaped) special token and of whitespace runs.
# The whole alternation sits inside one capture group, so ``re.split`` returns the matched separators as well
# instead of discarding them.
SPLIT_PTTN: Final[re.Pattern] = re.compile(
  ''.join(['(', '|'.join(re.escape(sp_tk) for sp_tk in SP_TKS), r'|\s+', ')']),
)


class WsTknzr(BaseTknzr):
  """Whitespace tokenizer class.

  Tokenize text into whitespace-separated tokens.
  No whitespace will be preserved after tokenization.

  Parameters
  ----------
  is_uncased: bool, default: False
    Set to ``True`` to convert text into lowercase.
    Mainly used by :py:meth:`~norm`.
  max_vocab: int, default: -1
    Tokenizer's maximum vocabulary size.
    Set to ``-1`` to include as many tokens in vocabulary as possible.
    Mainly used by :py:meth:`~build_vocab`.
  min_count: int, default: 0
    Minimum token occurrence counts.
    Tokens whose occurrence counts are less than ``min_count`` will not be added to tokenizer's vocabulary.
    Mainly used by :py:meth:`~build_vocab`.
  kwargs: typing.Any, optional
    Useless parameter.
    Intentionally left for subclasses inheritance.

  Attributes
  ----------
  id2tk: dict[int, str]
    Token-to-id inverse lookup table.
  is_uncased: bool
    Convert text into lowercase if set to ``True``.
  max_vocab: int
    Tokenizer's maximum vocabulary size.
  min_count: int
    Minimum token occurrence counts.
  tk2id: dict[str, int]
    Token-to-id lookup table.
  tknzr_name: typing.ClassVar[str]
    CLI name of whitespace tokenizer is ``whitespace``.

  See Also
  --------
  :doc:`lmp.tknzr </tknzr/index>`
    All available tokenizers.

  Examples
  --------
  >>> from lmp.tknzr import WsTknzr
  >>> tknzr = WsTknzr()
  >>> assert tknzr.tknz('a b c') == ['a', 'b', 'c']
  >>> assert tknzr.dtknz(['a', 'b', 'c']) == 'a b c'
  """

  tknzr_name: ClassVar[str] = 'whitespace'

  def tknz(self, txt: str) -> List[str]:
    """Split text on whitespaces.

    Text is first normalized, then split on special tokens and whitespaces.
    Special tokens are kept as tokens; whitespaces are discarded.

    Parameters
    ----------
    txt: str
      Text to be tokenized.

    Returns
    -------
    list[str]
      List of normalized whitespace-separated tokens.

    See Also
    --------
    ~dtknz
      Join text with whitespaces.
    ~norm
      Text normalization.

    Examples
    --------
    >>> from lmp.tknzr import WsTknzr
    >>> tknzr = WsTknzr()
    >>> assert tknzr.tknz('a b c') == ['a', 'b', 'c']
    >>> assert tknzr.tknz('abc def') == ['abc', 'def']
    """
    # Normalize before splitting so that tokens are produced from normalized text.
    norm_txt = self.norm(txt)

    # ``SPLIT_PTTN`` is a single capture group, therefore the split result still contains the matched special
    # tokens and whitespace runs.
    # Stripping turns whitespace-only pieces into empty strings, which are dropped together with the empty pieces
    # that ``re.split`` yields around adjacent matches.
    stripped_pieces = (piece.strip() for piece in SPLIT_PTTN.split(norm_txt))
    return [tk for tk in stripped_pieces if tk]

  def dtknz(self, tks: List[str]) -> str:
    """Join tokens with whitespaces.

    Insert whitespace between tokens.
    Returned text is normalized.

    Parameters
    ----------
    tks: list[str]
      Token list to be joined.

    Returns
    -------
    str
      Normalized text with whitespaces in between.

    See Also
    --------
    ~tknz
      Split text on whitespaces.
    ~norm
      Text normalization.

    Examples
    --------
    >>> from lmp.tknzr import WsTknzr
    >>> tknzr = WsTknzr()
    >>> assert tknzr.dtknz(['a', 'b', 'c']) == 'a b c'
    >>> assert tknzr.dtknz(['abc', 'def']) == 'abc def'
    """
    # Detokenize first by joining on a single space, then normalize the joined text.
    joined_txt = ' '.join(tks)
    return self.norm(joined_txt)