# Source code for lmp.script.tknz_txt
r"""Use pre-trained tokenizer to tokenize text.

One must first run the script :doc:`lmp.script.train_tknzr </script/train_tknzr>` before running this script.
The following example uses the pre-trained tokenizer under experiment ``my_tknzr_exp`` to tokenize the text ``'Hello World'``.

.. code-block:: shell

   python -m lmp.script.tknz_txt --exp_name my_tknzr_exp --txt "Hello World"

You can use the ``-h`` or ``--help`` option to get a list of supported CLI arguments.

.. code-block:: shell

   python -m lmp.script.tknz_txt -h

See Also
--------
:doc:`lmp.script.train_tknzr </script/train_tknzr>`
  Train tokenizer.
:doc:`lmp.tknzr </tknzr/index>`
  All available tokenizers.
"""
import argparse
import sys
from typing import List
import lmp.dset
import lmp.tknzr
import lmp.util.rand
import lmp.util.tknzr
import lmp.util.validate
def parse_args(argv: List[str]) -> argparse.Namespace:
    """Parse the CLI arguments of the tokenize-text script.

    Parameters
    ----------
    argv: list[str]
      List of CLI arguments to parse.

    See Also
    --------
    sys.argv
      Python CLI arguments interface.

    Returns
    -------
    argparse.Namespace
      Parsed CLI arguments, exposing the ``exp_name``, ``seed`` and ``txt`` attributes.
    """
    cli = argparse.ArgumentParser(
        prog='python -m lmp.script.tknz_txt',
        description='Use pre-trained tokenizer to tokenize text.',
    )

    # Every option carries a default, so none of them is mandatory on the command line.
    # Each entry is ``(flag, value type, default value, help text)``; the order below is the registration order.
    option_specs = (
        (
            '--exp_name',
            str,
            'my_tknzr_exp',
            '\nPre-trained tokenizer experiment name.\nDefault is ``my_tknzr_exp``.\n',
        ),
        (
            '--seed',
            int,
            42,
            '\nRandom seed.\nDefault is ``42``.\n',
        ),
        (
            '--txt',
            str,
            '',
            '\nText to be tokenized.\nDefault is empty string.\n',
        ),
    )
    for flag, value_type, fallback, help_text in option_specs:
        cli.add_argument(flag, default=fallback, help=help_text, type=value_type)

    parsed = cli.parse_args(argv)

    # Run the project's instance check on the text to be tokenized before handing the namespace back.
    lmp.util.validate.raise_if_not_instance(val=parsed.txt, val_name='args.txt', val_type=str)
    return parsed
def main(argv: List[str]) -> List[str]:
    """Script entry point.

    Parse the CLI arguments, seed the random number generators, load the pre-trained tokenizer saved under the
    requested experiment name and tokenize the given text with it.

    Parameters
    ----------
    argv: list[str]
      List of CLI arguments.

    Returns
    -------
    list[str]
      Result of calling the loaded tokenizer's ``tknz`` method on the ``--txt`` argument.
    """
    # Parse CLI arguments.
    args = parse_args(argv=argv)

    # Set random seed for reproducibility.
    lmp.util.rand.set_seed(seed=args.seed)

    # Load pre-trained tokenizer instance.
    tknzr = lmp.util.tknzr.load(exp_name=args.exp_name)

    # Tokenize text and hand the tokens back to the caller (the ``__main__`` guard prints them).
    return tknzr.tknz(args.txt)
if __name__ == '__main__':
    # Drop the program name; everything after it is forwarded to ``main``.
    cli_args = sys.argv[1:]
    tks = main(argv=cli_args)
    # Show the tokenization result on standard output.
    print(tks)