Source code for lmp.script.sample_dset
"""Use this script to sample data points of a dataset.
We can use the following script to sample text from :py:class:`~lmp.dset.WikiText2Dset`.
.. code-block:: shell
python -m lmp.script.sample_dset wiki-text-2
The default sampling index is ``0`` and the default version of :py:class:`~lmp.dset.WikiText2Dset` is ``train``.
Thus the following script has the same sampling result as above.
.. code-block:: shell
python -m lmp.script.sample_dset wiki-text-2 --idx 0 --ver train
The following script sample text from :py:class:`~lmp.dset.WikiText2Dset` with index set to ``1`` and version set to
``test``.
.. code-block:: shell
python -m lmp.script.sample_dset wiki-text-2 --idx 1 --ver test
You can use ``-h`` or ``--help`` options to get a list of available datasets.
.. code-block:: shell
python -m lmp.script.sample_dset -h
You can use ``-h`` or ``--help`` options on a specific dataset to get a list of supported CLI arguments, including all
available versions of a dataset.
.. code-block:: shell
python -m lmp.script.sample_dset wiki-text-2 -h
See Also
--------
:doc:`lmp.dset </dset/index>`
All available datasets.
"""
import argparse
import sys
from typing import List
import lmp.dset
import lmp.util.dset
import lmp.util.rand
import lmp.util.validate
[docs]def parse_args(argv: List[str]) -> argparse.Namespace:
"""Parse CLI arguments.
Parameters
----------
argv: list[str]
List of CLI arguments.
See Also
--------
sys.argv
Python CLI arguments interface.
Returns
-------
argparse.Namespace
Parsed CLI arguments.
"""
# Create parser.
parser = argparse.ArgumentParser('python -m lmp.script.sample_dset', description='Sample dataset.')
# Use dataset name to create subparser for all datasets.
subparsers = parser.add_subparsers(dest='dset_name', required=True)
for dset_name, dset_type in lmp.dset.DSET_OPTS.items():
dset_subparser = subparsers.add_parser(dset_name, description=f'Sample ``lmp.dset.{dset_type.__name__}``.')
dset_subparser.add_argument(
'--idx',
default=0,
help='''
Index of targeting sample.
Default is ``0``.
''',
type=int,
)
dset_subparser.add_argument(
'--seed',
default=42,
help='''
Random seed.
Default is ``42``.
''',
type=int,
)
dset_subparser.add_argument(
'--ver',
default=dset_type.df_ver,
help=f'''
Dataset version of ``lmp.dset.{dset_type.__name__}``.
Default version is ``{dset_type.df_ver}``.
''',
choices=dset_type.vers,
type=str,
)
args = parser.parse_args(argv)
# `args.idx` validation.
lmp.util.validate.raise_if_wrong_ordered(vals=[0, args.idx], val_names=['0', 'args.idx'])
return args
[docs]def main(argv: List[str]) -> None:
"""Script entry point.
Parameters
----------
argv: list[str]
List of CLI arguments.
Returns
-------
None
"""
# Parse CLI arguments.
args = parse_args(argv=argv)
# Set random seed for reproducibility.
lmp.util.rand.set_seed(seed=args.seed)
# Get dataset instance with specified version.
dset = lmp.util.dset.load(**args.__dict__)
# Output sample result.
print(dset[args.idx])
if __name__ == '__main__':
main(argv=sys.argv[1:])