# Source code for lmp.util.tknzr

"""Tokenizer utilities."""

import os
import pickle
from typing import Any, Final

import lmp.util.validate
import lmp.vars
from lmp.tknzr import TKNZR_OPTS, BaseTknzr

FILE_NAME: Final[str] = 'tknzr.pkl'


def create(tknzr_name: str, **kwargs: Any) -> BaseTknzr:
    """Build a tokenizer instance selected by its registered name.

    ``tknzr_name`` is looked up in :py:data:`lmp.tknzr.TKNZR_OPTS` and the
    resulting entry is called with ``**kwargs`` unchanged, so every keyword
    argument must be one that the chosen tokenizer's constructor accepts.

    Parameters
    ----------
    tknzr_name: str
      Name of the tokenizer to create.  Must be one of the keys of
      ``TKNZR_OPTS``.
    kwargs: typing.Any, optional
      Tokenizer's parameters, forwarded as-is to the constructor.

    Returns
    -------
    lmp.tknzr.BaseTknzr
      Tokenizer instance.

    Notes
    -----
    Argument checking is delegated to the helpers in ``lmp.util.validate``;
    the exceptions raised on invalid input are whatever those helpers raise.

    See Also
    --------
    :doc:`lmp.tknzr </tknzr/index>`
      All available tokenizers.

    Examples
    --------
    >>> from lmp.tknzr import WsTknzr
    >>> import lmp.util.tknzr
    >>> tknzr = lmp.util.tknzr.create(
    ...   is_uncased=False,
    ...   max_vocab=10,
    ...   min_count=2,
    ...   tknzr_name=WsTknzr.tknzr_name,
    ... )
    >>> assert isinstance(tknzr, WsTknzr)
    >>> assert not tknzr.is_uncased
    >>> assert tknzr.max_vocab == 10
    >>> assert tknzr.min_count == 2
    """
    validate = lmp.util.validate

    # The name must be a string before it can be checked against the registry.
    validate.raise_if_not_instance(val=tknzr_name, val_name='tknzr_name', val_type=str)

    # Only names registered in `TKNZR_OPTS` are accepted.
    registered_names = list(TKNZR_OPTS.keys())
    validate.raise_if_not_in(val=tknzr_name, val_name='tknzr_name', val_range=registered_names)

    # Look up the registered entry and hand it the caller's keyword arguments.
    tknzr_factory = TKNZR_OPTS[tknzr_name]
    return tknzr_factory(**kwargs)
def load(exp_name: str) -> BaseTknzr:
    """Restore a previously saved tokenizer from its pickle file.

    The file read is ``FILE_NAME`` (``tknzr.pkl``) inside the directory
    ``exp_name`` under ``lmp.vars.EXP_PATH``.

    .. warning::

      The file is deserialized with :py:func:`pickle.load`, which can execute
      arbitrary code while loading.  Only load experiment files you trust.

    Parameters
    ----------
    exp_name: str
      Pre-trained tokenizer experiment name.  Must be a non-empty string.

    Returns
    -------
    lmp.tknzr.BaseTknzr
      Pre-trained tokenizer instance, i.e. whatever object was pickled into
      the file.

    Notes
    -----
    Argument and path checking is delegated to the helpers in
    ``lmp.util.validate``; the exceptions raised on invalid input are whatever
    those helpers raise.  A missing file surfaces as the error raised by
    :py:func:`open`.

    See Also
    --------
    :doc:`lmp.tknzr </tknzr/index>`
      All available tokenizers.

    Examples
    --------
    >>> from lmp.tknzr import WsTknzr
    >>> import lmp.util.tknzr
    >>> tknzr = lmp.util.tknzr.create(
    ...   is_uncased=True,
    ...   max_vocab=10,
    ...   min_count=2,
    ...   tknzr_name=WsTknzr.tknzr_name,
    ... )
    >>> lmp.util.tknzr.save(exp_name='my_exp', tknzr=tknzr)
    >>> load_tknzr = lmp.util.tknzr.load(exp_name='my_exp')
    >>> assert isinstance(load_tknzr, WsTknzr)
    >>> assert load_tknzr.id2tk == tknzr.id2tk
    >>> assert load_tknzr.is_uncased == tknzr.is_uncased
    >>> assert load_tknzr.max_vocab == tknzr.max_vocab
    >>> assert load_tknzr.min_count == tknzr.min_count
    >>> assert load_tknzr.tk2id == tknzr.tk2id
    """
    validate = lmp.util.validate

    # The experiment name must be a non-empty string.
    validate.raise_if_not_instance(val=exp_name, val_name='exp_name', val_type=str)
    validate.raise_if_empty_str(val=exp_name, val_name='exp_name')

    # The pickle path must not point at a directory.
    pkl_path = os.path.join(lmp.vars.EXP_PATH, exp_name, FILE_NAME)
    validate.raise_if_is_directory(path=pkl_path)

    # Deserialize and return the stored tokenizer.
    with open(pkl_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)
def save(exp_name: str, tknzr: BaseTknzr) -> None:
    """Save tokenizer as pickle file.

    The tokenizer is pickled to ``FILE_NAME`` (``tknzr.pkl``) inside the
    directory ``exp_name`` under ``lmp.vars.EXP_PATH``.  The directory is
    created, including missing parents, when it does not exist yet.

    .. danger::

      This method overwrites existing files.  Make sure you know what you are
      doing before calling this method.

    Parameters
    ----------
    exp_name: str
      Tokenizer training experiment name.  Must be a non-empty string.
    tknzr: lmp.tknzr.BaseTknzr
      Tokenizer to be saved.

    Returns
    -------
    None

    Notes
    -----
    Argument and path checking is delegated to the helpers in
    ``lmp.util.validate``; the exceptions raised on invalid input are whatever
    those helpers raise.

    See Also
    --------
    ~load
      Load pre-trained tokenizer instance by experiment name.

    Examples
    --------
    >>> from lmp.tknzr import CharTknzr
    >>> import lmp.util.tknzr
    >>> tknzr = CharTknzr(is_uncased=False, max_vocab=10, min_count=2)
    >>> lmp.util.tknzr.save(exp_name='test', tknzr=tknzr)
    """
    # `exp_name` validation.
    lmp.util.validate.raise_if_not_instance(val=exp_name, val_name='exp_name', val_type=str)
    lmp.util.validate.raise_if_empty_str(val=exp_name, val_name='exp_name')

    # `dir_path` validation.  The experiment directory must not be a regular
    # file.
    dir_path = os.path.join(lmp.vars.EXP_PATH, exp_name)
    lmp.util.validate.raise_if_is_file(path=dir_path)

    # `exist_ok=True` avoids the check-then-create race of testing
    # `os.path.exists` first: a directory created concurrently by another
    # process no longer makes this call fail.
    os.makedirs(dir_path, exist_ok=True)

    # `file_path` validation.  The target must not be a directory.
    file_path = os.path.join(dir_path, FILE_NAME)
    lmp.util.validate.raise_if_is_directory(path=file_path)

    # Save tokenizer as pickle file.  Mode 'wb' truncates any existing file.
    with open(file_path, 'wb') as f:
        pickle.dump(tknzr, f)