Source code for lmp.model._lstm_2000

"""LSTM (2000 version) language model."""

import argparse
from typing import Any, ClassVar, Optional, Tuple

import torch
import torch.nn as nn

import lmp.util.metric
import lmp.util.validate
from lmp.model._lstm_1997 import LSTM1997, LSTM1997Layer
from lmp.tknzr._base import BaseTknzr


[docs]class LSTM2000(LSTM1997):
  r"""LSTM (2000 version) :footcite:`gers2000lstm` language model.

  - Let :math:`x` be batch of token ids with batch size :math:`B` and per sequence length :math:`S`.
  - Let :math:`V` be the vocabulary size of the paired tokenizer.
    Each token id represents an unique token, i.e., :math:`x_t \in \set{1, \dots, V}`.
  - Let :math:`E` be the token embedding lookup table.

    - Let :math:`\dEmb` be the dimension of token embeddings.
    - Let :math:`e_t` be the token embedding correspond to token id :math:`x_t`.
    - Token embeddings have dropout probability :math:`\pEmb`.

  - Let :math:`\nLyr` be the number of recurrent layers.
  - Let :math:`\dBlk` be the number of units in a memory cell block.
  - Let :math:`\nBlk` be the number of memory cell blocks.
  - Let :math:`\dHid = \nBlk \times \dBlk`.
  - Let :math:`h^\ell` be the hidden states of the :math:`\ell` th recurrent layer.

    - Let :math:`h_t^\ell` be the :math:`t` th time step of :math:`h^\ell`.
    - The initial hidden states :math:`h_0^\ell` are given as input.
    - Hidden states have dropout probability :math:`\pHid`.

  - Let :math:`c^\ell` be the memory cell internal states of the :math:`\ell` th recurrent layer.

    - let :math:`c_t^\ell` be the :math:`t` th time step of :math:`c^\ell`.
    - The memory cell initial internal states :math:`c_0^\ell` are given as input.

  LSTM (2000 version) language model is defined as follow:

  .. math::

    \begin{align*}
      & \algoProc{\LSTMZeroZero}\pa{x, \pa{\br{c_0^1, \dots, c_0^{\nLyr}}, \br{h_0^1, \dots, h_0^{\nLyr}}}} \\
      & \indent{1} \algoFor{t \in \set{1, \dots, S}}                                                        \\
      & \indent{2} e_t \algoEq (x_t)\text{-th row of } E \text{ but treated as column vector}               \\
      & \indent{2} \widehat{e_t} \algoEq \drop{e_t}{\pEmb}                                                  \\
      & \indent{2} h_t^0 \algoEq \tanh\pa{W_h \cdot \widehat{e_t} + b_h}                                    \\
      & \indent{1} \algoEndFor                                                                              \\
      & \indent{1} h^0 \algoEq \cat{h_1^0, \dots, h_S^0}                                                    \\
      & \indent{1} \widehat{h^0} \algoEq \drop{h^0}{\pHid}                                                  \\
      & \indent{1} \algoFor{\ell \in \set{1, \dots, \nLyr}}                                                 \\
      & \indent{2} \pa{c^\ell, h^\ell} \algoEq \LSTMZeroZeroLayer\pa{
                                                 x \algoEq \widehat{h^{\ell-1}},
                                                 c_0 \algoEq c_0^\ell,
                                                 h_0 \algoEq h_0^\ell
                                               }                                                            \\
      & \indent{2} \widehat{h^\ell} \algoEq \drop{h^\ell}{\pHid}                                            \\
      & \indent{1} \algoEndFor                                                                              \\
      & \indent{1} \algoFor{t \in \set{1, \dots, S}}                                                        \\
      & \indent{2} z_t \algoEq \tanh\pa{W_z \cdot h_t^{\nLyr} + b_z}                                        \\
      & \indent{2} \widehat{z_t} \algoEq \drop{z_t}{\pHid}                                                  \\
      & \indent{2} y_t \algoEq \sof{E \cdot \widehat{z_t}}                                                  \\
      & \indent{1} \algoEndFor                                                                              \\
      & \indent{1} y \algoEq \cat{y_1, \dots, y_S}                                                          \\
      & \indent{1} \algoReturn \pa{y, \pa{\br{c_S^1, \dots, c_S^{\nLyr}}, \br{h_S^1, \dots, h_S^{\nLyr}}}}  \\
      & \algoEndProc
    \end{align*}

  +-------------------------------------------+---------------------------------------------------------+
  | Trainable Parameters                      | Nodes                                                   |
  +------------------+------------------------+--------------------------+------------------------------+
  | Parameter        | Shape                  | Symbol                   | Shape                        |
  +==================+========================+==========================+==============================+
  | :math:`E`        | :math:`(V, \dEmb)`     | :math:`c^\ell`           | :math:`(B, S, \nBlk, \dBlk)` |
  +------------------+------------------------+--------------------------+------------------------------+
  | :math:`W_h`      | :math:`(\dHid, \dEmb)` | :math:`c_t^\ell`         | :math:`(B, \nBlk, \dBlk)`    |
  +------------------+------------------------+--------------------------+------------------------------+
  | :math:`W_z`      | :math:`(\dEmb, \dHid)` | :math:`e_t`              | :math:`(B, S, \dEmb)`        |
  +------------------+------------------------+--------------------------+------------------------------+
  | :math:`b_h`      | :math:`(\dHid)`        | :math:`h^\ell`           | :math:`(B, S, \dHid)`        |
  +------------------+------------------------+--------------------------+------------------------------+
  | :math:`b_z`      | :math:`(\dEmb)`        | :math:`h_t^\ell`         | :math:`(B, \dHid)`           |
  +------------------+------------------------+--------------------------+------------------------------+
  | :math:`\LSTMZeroZeroLayer`                | :math:`\widehat{h^\ell}` | :math:`(B, \dHid)`           |
  +------------------+------------------------+--------------------------+------------------------------+
  |                                           | :math:`x`                | :math:`(B, S)`               |
  |                                           +--------------------------+------------------------------+
  |                                           | :math:`x_t`              | :math:`(B)`                  |
  |                                           +--------------------------+------------------------------+
  |                                           | :math:`y`                | :math:`(B, S, V)`            |
  |                                           +--------------------------+------------------------------+
  |                                           | :math:`y_t`              | :math:`(B, V)`               |
  |                                           +--------------------------+------------------------------+
  |                                           | :math:`z_t`              | :math:`(B, \dEmb)`           |
  |                                           +--------------------------+------------------------------+
  |                                           | :math:`\widehat{z_t}`    | :math:`(B, \dEmb)`           |
  +-------------------------------------------+--------------------------+------------------------------+

  - The only differences between :py:class:`~lmp.model.LSTM1997` and :py:class:`~LSTM2000` are the underlying layers
    :py:class:`~lmp.model.LSTM1997Layer` and :py:class:`~LSTM2000Layer`.
    All other symbols are calculated as in :py:class:`lmp.model.LSTM1997`.
  - Forget gate biases are initialized with uniform distribution :math:`\mathcal{U}(0, \init_{fb})`.
    This make forget gate remain open at the start of training.

  Parameters
  ----------
  d_blk: int, default: 1
    Number of units in a memory cell block :math:`\dBlk`.
  d_emb: int, default: 1
    Token embedding dimension :math:`\dEmb`.
  init_ib: float, default: 1.0
    Uniform distribution upper bound :math:`\init_{fb}` used to initialize forget gate biases.
  init_ib: float, default: -1.0
    Uniform distribution lower bound :math:`\init_{ib}` used to initialize input gate biases.
  init_lower: float, default: -0.1
    Uniform distribution lower bound :math:`\init_l` used to initialize model parameters.
  init_ob: float, default: -1.0
    Uniform distribution lower bound :math:`\init_{ob}` used to initialize output gate biases.
  init_upper: float, default: 0.1
    Uniform distribution upper bound :math:`\init_u` used to initialize model parameters.
  kwargs: typing.Any, optional
    Useless parameter.
    Intently left for subclasses inheritance.
  label_smoothing: float, default: 0.0
    Smoothing applied on prediction target :math:`x_{t+1}`.
  n_blk: int, default: 1
    Number of memory cell blocks :math:`\nBlk`.
  n_lyr: int, default: 1
    Number of recurrent layers :math:`\nLyr`.
  p_emb: float, default: 0.0
    Embeddings dropout probability :math:`\pEmb`.
  p_hid: float, default: 0.0
    Hidden units dropout probability :math:`\pHid`.
  tknzr: ~lmp.tknzr.BaseTknzr
    Tokenizer instance.

  Attributes
  ----------
  d_blk: int
    Number of units in a memory cell block :math:`\dBlk`.
  d_hid: int
    Total number of memory cell units :math:`\dHid`.
  emb: torch.nn.Embedding
    Token embedding lookup table :math:`E`.
    Input shape: :math:`(B, S)`.
    Output shape: :math:`(B, S, \dEmb)`.
  fc_e2h: torch.nn.Sequential
    Fully connected layer :math:`W_h` and :math:`b_h` which connects input
    units to the 1st recurrent layer's input.
    Dropout with probability :math:`\pEmb` is applied to input.
    Dropout with probability :math:`\pHid` is applied to output.
    Input shape: :math:`(B, S, \dEmb)`.
    Output shape: :math:`(B, S, \dHid)`.
  fc_h2e: torch.nn.Sequential
    Fully connected layer :math:`W_z` and :math:`b_z` which transforms hidden states to next token embeddings.
    Dropout with probability :math:`\pHid` is applied to output.
    Input shape: :math:`(B, S, \dHid)`.
    Output shape: :math:`(B, S, \dEmb)`.
  init_fb: float
    Uniform distribution upper bound :math:`\init_{fb}` used to initialize forget gate biases.
  init_ib: float
    Uniform distribution lower bound :math:`\init_{ib}` used to initialize input gate biases.
  init_lower: float
    Uniform distribution lower bound :math:`\init_l` used to initialize model parameters.
  init_ob: float
    Uniform distribution lower bound :math:`\init_{ob}` used to initialize output gate biases.
  init_upper: float
    Uniform distribution upper bound :math:`\init_u` used to initialize model parameters.
  label_smoothing: float
    Smoothing applied on prediction target :math:`x_{t+1}`.
  model_name: ClassVar[str]
    CLI name of LSTM (2000 version) is ``LSTM-2000``.
  n_blk: int
    Number of memory cell blocks :math:`\nBlk`.
  n_lyr: int
    Number of recurrent layers :math:`\nLyr`.
  p_emb: float
    Embeddings dropout probability :math:`\pEmb`.
  p_hid: float
    Hidden units dropout probability :math:`\pHid`.
  stack_rnn: torch.nn.ModuleList
    :py:class:`~LSTM2000Layer` stacking layers.
    Each LSTM (2000 version) layer is followed by a dropout layer with probability :math:`\pHid`.
    The number of stacking layers is equal to :math:`2 \nLyr`.
    Input shape: :math:`(B, S, \dHid)`.
    Output shape: :math:`(B, S, \dHid)`.

  See Also
  --------
  ~lmp.model.LSTM1997
    LSTM (1997 version) language model.
  ~lmp.model.LSTM1997Layer
    LSTM (1997 version) recurrent neural network.
  ~LSTM2000Layer
    LSTM (2000 version) recurrent neural network.
  """

  model_name: ClassVar[str] = 'LSTM-2000'

  def __init__(
    self,
    *,
    d_blk: int = 1,
    d_emb: int = 1,
    init_fb: float = 1.0,
    init_ib: float = -1.0,
    init_lower: float = -0.1,
    init_ob: float = -1.0,
    init_upper: float = 0.1,
    label_smoothing: float = 0.0,
    n_blk: int = 1,
    n_lyr: int = 1,
    p_emb: float = 0.0,
    p_hid: float = 0.0,
    tknzr: BaseTknzr,
    **kwargs: Any,
  ):
    super().__init__(
      d_blk=d_blk,
      d_emb=d_emb,
      init_ib=init_ib,
      init_lower=init_lower,
      init_ob=init_ob,
      init_upper=init_upper,
      label_smoothing=label_smoothing,
      n_blk=n_blk,
      n_lyr=n_lyr,
      p_emb=p_emb,
      p_hid=p_hid,
      tknzr=tknzr,
      **kwargs,
    )

    # `init_fb` validation.
    lmp.util.validate.raise_if_not_instance(val=init_fb, val_name='init_fb', val_type=float)
    lmp.util.validate.raise_if_wrong_ordered(vals=[0, init_fb], val_names=['0', 'init_fb'])
    self.init_fb = init_fb

    # Stacking LSTM (2000 version) layers.
    # Each RNN layer is followed by one dropout layer.
    self.stack_rnn = nn.ModuleList([])
    for _ in range(n_lyr):
      self.stack_rnn.append(
        LSTM2000Layer(
          d_blk=d_blk,
          in_feat=self.d_hid,
          init_fb=init_fb,
          init_lower=init_lower,
          init_ob=init_ob,
          init_upper=init_upper,
          n_blk=n_blk,
        )
      )
      self.stack_rnn.append(nn.Dropout(p=p_hid))

[docs]  @classmethod
  def add_CLI_args(cls, parser: argparse.ArgumentParser) -> None:
    """Add LSTM (2000 version) language model hyperparameters to CLI argument parser.

    Parameters
    ----------
    parser: argparse.ArgumentParser
      CLI argument parser.

    Returns
    -------
    None

    See Also
    --------
    :doc:`lmp.script.train_model </script/train_model>`
      Language model training script.

    Examples
    --------
    >>> import argparse
    >>> import math
    >>> from lmp.model import LSTM2000
    >>> parser = argparse.ArgumentParser()
    >>> LSTM2000.add_CLI_args(parser)
    >>> args = parser.parse_args([
    ...   '--d_blk', '64',
    ...   '--d_emb', '100',
    ...   '--init_fb', '0.1',
    ...   '--init_ib', '-0.1',
    ...   '--init_lower', '-0.01',
    ...   '--init_ob', '-0.1',
    ...   '--init_upper', '0.01',
    ...   '--label_smoothing', '0.1',
    ...   '--n_blk', '8',
    ...   '--n_lyr', '2',
    ...   '--p_emb', '0.5',
    ...   '--p_hid', '0.1',
    ... ])
    >>> assert args.d_blk == 64
    >>> assert args.d_emb == 100
    >>> assert math.isclose(args.init_fb, 0.1)
    >>> assert math.isclose(args.init_ib, -0.1)
    >>> assert math.isclose(args.init_lower, -0.01)
    >>> assert math.isclose(args.init_ob, -0.1)
    >>> assert math.isclose(args.init_upper, 0.01)
    >>> assert math.isclose(args.label_smoothing, 0.1)
    >>> assert args.n_blk == 8
    >>> assert args.n_lyr == 2
    >>> assert math.isclose(args.p_emb, 0.5)
    >>> assert math.isclose(args.p_hid, 0.1)
    """
    # `parser` validation.
    lmp.util.validate.raise_if_not_instance(val=parser, val_name='parser', val_type=argparse.ArgumentParser)

    # Add hyperparameters to CLI arguments.
    group = parser.add_argument_group('LSTM (2000 version) language model hyperparameters')
    group.add_argument(
      '--d_blk',
      default=1,
      help='''
      Dimension of each memory cell block.
      Default is ``1``.
      ''',
      type=int,
    )
    group.add_argument(
      '--d_emb',
      default=1,
      help='''
      Token embedding dimension.
      Default is ``1``.
      ''',
      type=int,
    )
    group.add_argument(
      '--init_fb',
      default=1.0,
      help='''
      Uniform distribution upper bound used to initialize forget gate biases.
      Default is ``1.0``.
      ''',
      type=float,
    )
    group.add_argument(
      '--init_ib',
      default=-1.0,
      help='''
      Uniform distribution lower bound used to initialize input gate biases.
      Default is ``-1.0``.
      ''',
      type=float,
    )
    group.add_argument(
      '--init_lower',
      default=-0.1,
      help='''
      Uniform distribution lower bound used to initialize model parameters.
      Default is ``-0.1``.
      ''',
      type=float,
    )
    group.add_argument(
      '--init_ob',
      default=-1.0,
      help='''
      Uniform distribution lower bound used to initialize output gate biases.
      Default is ``-1.0``.
      ''',
      type=float,
    )
    group.add_argument(
      '--init_upper',
      default=0.1,
      help='''
      Uniform distribution lower bound used to initialize model parameters.
      Default is ``0.1``.
      ''',
      type=float,
    )
    group.add_argument(
      '--label_smoothing',
      default=0.0,
      help='''
      Label smoothing applied on cross entropy loss.
      Default is ``0.0``.
      ''',
      type=float,
    )
    group.add_argument(
      '--n_blk',
      default=1,
      help='''
      Number of memory cell blocks.
      Default is ``1``.
      ''',
      type=int,
    )
    group.add_argument(
      '--n_lyr',
      default=1,
      help='''
      Number of LSTM (2000 version) layers.
      Default is ``1``.
      ''',
      type=int,
    )
    group.add_argument(
      '--p_emb',
      default=0.0,
      help='''
      Embeddings dropout probability.
      Default is ``0.0``.
      ''',
      type=float,
    )
    group.add_argument(
      '--p_hid',
      default=0.0,
      help='''
      Hidden units dropout probability.
      Default is ``0.0``.
      ''',
      type=float,
    )


[docs]class LSTM2000Layer(LSTM1997Layer):
  r"""LSTM (2000 version) :footcite:`gers2000lstm` recurrent neural network.

  - Let :math:`\hIn` be the number of input features per time step.
  - Let :math:`\dBlk` be the number of units in a memory cell block.
  - Let :math:`\nBlk` be the number of memory cell blocks.
  - Let :math:`\hOut = \nBlk \times \dBlk` be the number of output features per time step.
  - Let :math:`x` be a batch of sequences of input features with shape :math:`(B, S, \hIn)`, where :math:`B` is batch
    size and :math:`S` is per sequence length.
  - Let :math:`h_0` be the initial hidden states with shape :math:`(B, \hOut)`.
  - Let :math:`c_0` be the memory cell initial internal states with shape :math:`(B, \nBlk, \dBlk)`.

  LSTM (2000 version) layer is defined as follow:

  .. math::

    \begin{align*}
      & \algoProc{\LSTMZeroZeroLayer}\pa{x, c_0, h_0}                                                   \\
      & \indent{1} S \algoEq x.\sz{1}                                                                   \\
      & \indent{1} \algoFor{t \in \set{1, \dots, S}}                                                    \\
      & \indent{2} f_t \algoEq \sigma(W_f \cdot x_t + U_f \cdot h_{t-1} + b_f)       &&\tag{1}\label{1} \\
      & \indent{2} i_t \algoEq \sigma(W_i \cdot x_t + U_i \cdot h_{t-1} + b_i)                          \\
      & \indent{2} o_t \algoEq \sigma(W_o \cdot x_t + U_o \cdot h_{t-1} + b_o)                          \\
      & \indent{2} \algoFor{k \in \set{1, \dots, \nBlk}}                                                \\
      & \indent{3} g_{t,k} \algoEq \tanh\pa{W_k \cdot x_t + U_k \cdot h_{t-1} + b_k}                    \\
      & \indent{3} c_{t,k} \algoEq f_{t, k} \cdot c_{t-1,k} + i_{t,k} \cdot g_{t,k}  &&\tag{2}\label{2} \\
      & \indent{3} h_{t,k} \algoEq o_{t,k} \cdot \tanh\pa{c_{t,k}}                                      \\
      & \indent{2} \algoEndFor                                                                          \\
      & \indent{2} c_t \algoEq \cat{c_{t,1}, \dots, c_{t,\nBlk}}                                        \\
      & \indent{2} h_t \algoEq \fla{h_{t,1}, \dots, h_{t,\nBlk}}                                        \\
      & \indent{1} \algoEndFor                                                                          \\
      & \indent{1} c \algoEq \cat{c_1, \dots, c_S}                                                      \\
      & \indent{1} h \algoEq \cat{h_1, \dots, h_S}                                                      \\
      & \indent{1} \algoReturn (c, h)                                                                   \\
      & \algoEndProc
    \end{align*}

  +--------------------------------------+------------------------------------------------+
  | Trainable Parameters                 | Nodes                                          |
  +-------------+------------------------+-----------------+------------------------------+
  | Parameter   | Shape                  | Symbol          | Shape                        |
  +=============+========================+=================+==============================+
  | :math:`U_f` | :math:`(\nBlk, \hOut)` | :math:`c`       | :math:`(B, S, \nBlk, \dBlk)` |
  +-------------+------------------------+-----------------+------------------------------+
  | :math:`U_i` | :math:`(\nBlk, \hOut)` | :math:`c_t`     | :math:`(B, \nBlk, \dBlk)`    |
  +-------------+------------------------+-----------------+------------------------------+
  | :math:`U_k` | :math:`(\dBlk, \hOut)` | :math:`c_{t,k}` | :math:`(B, \dBlk)`           |
  +-------------+------------------------+-----------------+------------------------------+
  | :math:`U_o` | :math:`(\nBlk, \hOut)` | :math:`f_t`     | :math:`(B, \nBlk)`           |
  +-------------+------------------------+-----------------+------------------------------+
  | :math:`W_f` | :math:`(\nBlk, \hIn)`  | :math:`f_{t,k}` | :math:`(B, 1)`               |
  +-------------+------------------------+-----------------+------------------------------+
  | :math:`W_i` | :math:`(\nBlk, \hIn)`  | :math:`g_{t,k}` | :math:`(B, \dBlk)`           |
  +-------------+------------------------+-----------------+------------------------------+
  | :math:`W_k` | :math:`(\dBlk, \hIn)`  | :math:`h`       | :math:`(B, S, \hOut)`        |
  +-------------+------------------------+-----------------+------------------------------+
  | :math:`W_o` | :math:`(\nBlk, \hIn)`  | :math:`h_t`     | :math:`(B, \hOut)`           |
  +-------------+------------------------+-----------------+------------------------------+
  | :math:`b_f` | :math:`(\nBlk)`        | :math:`h_{t,k}` | :math:`(B, \dBlk)`           |
  +-------------+------------------------+-----------------+------------------------------+
  | :math:`b_i` | :math:`(\nBlk)`        | :math:`i_t`     | :math:`(B, \nBlk)`           |
  +-------------+------------------------+-----------------+------------------------------+
  | :math:`b_k` | :math:`(\dBlk)`        | :math:`i_{t,k}` | :math:`(B, 1)`               |
  +-------------+------------------------+-----------------+------------------------------+
  | :math:`b_o` | :math:`(\nBlk)`        | :math:`o_t`     | :math:`(B, \nBlk)`           |
  +-------------+------------------------+-----------------+------------------------------+
  |                                      | :math:`o_{t,k}` | :math:`(B, 1)`               |
  |                                      +-----------------+------------------------------+
  |                                      | :math:`x`       | :math:`(B, S, \hIn)`         |
  |                                      +-----------------+------------------------------+
  |                                      | :math:`x_t`     | :math:`(B, \hIn)`            |
  +--------------------------------------+-----------------+------------------------------+

  - The only differences between :py:class:`~lmp.model.LSTM1997Layer` and :py:class:`~LSTM2000Layer` are equations
    :math:`\eqref{1}\eqref{2}`.
    All other symbols are calculated as in :py:class:`~lmp.model.LSTM1997Layer`.
  - :math:`f_t` is memory cell blocks' forget gate units at time step :math:`t`.
    :math:`f_{t,k}` is the :math:`k`-th coordinates of :math:`f_t`, which represents the :math:`k`-th memory cell
    block's forget gate unit at time step :math:`t`.
  - Model parameters in LSTM (2000 version) layer are initialized as in :py:class:`~lmp.model.LSTM1997Layer`.
  - Forget gate biases are initialized with uniform distribution :math:`\mathcal{U}(0, \init_{fb})`.
    The upper bound :math:`\init_{fb}` is given as hyperparameter.
    This make forget gate remain open at the start of training.

  Parameters
  ----------
  d_blk: int, default: 1
    Dimension of each memory cell block :math:`\dBlk`.
  in_feat: int, default: 1
    Number of input features per time step :math:`\hIn`.
  init_fb: float, default: 1.0
    Uniform distribution upper bound :math:`\init_{fb}` used to initialize forget gate biases.
  init_ib: float, default: -1.0
    Uniform distribution lower bound :math:`\init_{ib}` used to initialize input gate biases.
  init_lower: float, default: -0.1
    Uniform distribution lower bound :math:`\init_l` used to initialize model parameters.
  init_ob: float, default: -1.0
    Uniform distribution lower bound :math:`\init_{ob}` used to initialize output gate biases.
  init_upper: float, default: 0.1
    Uniform distribution upper bound :math:`\init_u` used to initialize model parameters.
  n_blk: int, default: 1
    Number of memory cell blocks :math:`\nBlk`.
  kwargs: typing.Any, optional
    Useless parameter.
    Intently left for subclasses inheritance.

  Attributes
  ----------
  c_0: torch.Tensor
    Memory cell blocks' initial internal states :math:`c_0`.
    Shape: :math:`(1, \nBlk, \dBlk)`.
  d_blk: int
    Number of units in a memory cell block :math:`\dBlk`.
  d_hid: int
    Total number of memory cell units :math:`\hOut`.
  fc_h2fg: torch.nn.Linear
    Fully connected layer :math:`U_f` which connects hidden states to memory cell's forget gate units.
    Input shape: :math:`(B, \dHid)`.
    Output shape: :math:`(B, \nBlk)`.
  fc_h2ig: torch.nn.Linear
    Fully connected layer :math:`U_i` which connects hidden states to memory cell's input gate units.
    Input shape: :math:`(B, \dHid)`.
    Output shape: :math:`(B, \nBlk)`.
  fc_h2mc_in: torch.nn.Linear
    Fully connected layers :math:`\pa{U_1, \dots, U_{\nBlk}}` which connect hidden states to memory cell
    blocks' input activations.
    Input shape: :math:`(B, \dHid)`.
    Output shape: :math:`(B, \dHid)`.
  fc_h2og: torch.nn.Linear
    Fully connected layer :math:`U_o` which connects hidden states to memory cell's output gate units.
    Input shape: :math:`(B, \dHid)`.
    Output shape: :math:`(B, \nBlk)`.
  fc_x2fg: torch.nn.Linear
    Fully connected layer :math:`W_f` and :math:`b_f` which connects input units to memory cell's forget gate units.
    Input shape: :math:`(B, S, \hIn)`.
    Output shape: :math:`(B, S, \nBlk)`.
  fc_x2ig: torch.nn.Linear
    Fully connected layer :math:`W_i` and :math:`b_i` which connects input units to memory cell's input gate units.
    Input shape: :math:`(B, S, \hIn)`.
    Output shape: :math:`(B, S, \nBlk)`.
  fc_x2mc_in: torch.nn.Linear
    Fully connected layers :math:`\pa{W_1, \dots, W_{\nBlk}}` and :math:`\pa{b_1, \dots, b_{\nBlk}}` which connects
    input units to memory cell blocks' input activations.
    Input shape: :math:`(B, S, \hIn)`.
    Output shape: :math:`(B, S, \dHid)`.
  fc_x2og: torch.nn.Linear
    Fully connected layer :math:`W_o` and :math:`b_o` which connects input units to memory cell's output gate units.
    Input shape: :math:`(B, S, \hIn)`.
    Output shape: :math:`(B, S, \nBlk)`.
  h_0: torch.Tensor
    Initial hidden states :math:`h_0`.
    Shape: :math:`(1, \dHid)`
  in_feat: int
    Number of input features per time step :math:`\hIn`.
  init_fb: float
    Uniform distribution upper bound :math:`\init_{fb}` used to initialize forget gate biases.
  init_ib: float
    Uniform distribution lower bound :math:`\init_{ib}` used to initialize input gate biases.
  init_lower: float
    Uniform distribution lower bound :math:`\init_l` used to initialize model parameters.
  init_ob: float
    Uniform distribution lower bound :math:`\init_{ob}` used to initialize output gate biases.
  init_upper: float
    Uniform distribution upper bound :math:`\init_u` used to initialize model parameters.
  n_blk: int
    Number of memory cell blocks :math:`\nBlk`.

  See Also
  --------
  ~lmp.model.LSTM1997Layer
    LSTM (1997 version) recurrent neural network.
  """

  def __init__(
    self,
    *,
    d_blk: int = 1,
    in_feat: int = 1,
    init_fb: float = 1.0,
    init_ib: float = -1.0,
    init_lower: float = -0.1,
    init_ob: float = -1.0,
    init_upper: float = 0.1,
    n_blk: int = 1,
    **kwargs: Any,
  ):
    super().__init__(
      d_blk=d_blk,
      in_feat=in_feat,
      init_ib=init_ib,
      init_lower=init_lower,
      init_ob=init_ob,
      init_upper=init_upper,
      n_blk=n_blk,
      **kwargs,
    )

    # `init_fb` validation.
    lmp.util.validate.raise_if_not_instance(val=init_fb, val_name='init_fb', val_type=float)
    lmp.util.validate.raise_if_wrong_ordered(vals=[0, init_fb], val_names=['0', 'init_fb'])
    self.init_fb = init_fb

    # Fully connected layer which connects input units to forget gate units.
    self.fc_x2fg = nn.Linear(in_features=in_feat, out_features=n_blk)

    # Fully connected layer which connects hidden states to forget gate units.
    # Set `bias=False` to share bias term with `self.fc_x2fg` layer.
    self.fc_h2fg = nn.Linear(in_features=self.d_hid, out_features=n_blk, bias=False)

[docs]  def forward(
    self,
    x: torch.Tensor,
    c_0: Optional[torch.Tensor] = None,
    h_0: Optional[torch.Tensor] = None,
  ) -> Tuple[torch.Tensor, torch.Tensor]:
    r"""Calculate batch of hidden states for ``x``.

    Below we describe the forward pass algorithm of LSTM (2000 version) layer.

    #. Let ``x`` be a batch of sequences of input features :math:`x`.
    #. Let ``x.size(1)`` be sequence length :math:`S`.
    #. Let ``c_0`` be the memory cell initial internal states :math:`c_0`.
       If ``c_0 is None``, use ``self.c_0`` instead.
    #. Let ``h_0`` be the initial hidden states :math:`h_0`.
       If ``h_0 is None``, use ``self.h_0`` instead.
    #. Loop through :math:`\set{1, \dots, S}` with looping index :math:`t`.

       #. Use :math:`x_t`, :math:`h_{t-1}`, ``self.fc_x2fg`` and ``self.fc_h2fg`` to get forget gate units :math:`f_t`.
       #. Use :math:`x_t`, :math:`h_{t-1}`, ``self.fc_x2ig`` and ``self.fc_h2ig`` to get input gate units :math:`i_t`.
       #. Use :math:`x_t`, :math:`h_{t-1}`, ``self.fc_x2og`` and ``self.fc_h2og`` to get output gate units :math:`o_t`.
       #. Use :math:`x_t`, :math:`h_{t-1}`, ``self.fc_x2mc_in`` and ``self.fc_h2mc_in`` to get memory cell input
          activations :math:`g_{t,1}, \dots, g_{t,\nBlk}`.
       #. Derive memory cell new internal state :math:`c_{t,1}, \dots, c_{t,\nBlk}` using forget gates units
          :math:`f_{t,1}, \dots, f_{t,\nBlk}`, input gate units :math:`i_{t,1}, \dots, i_{t,\nBlk}`, memory cell old
          internal states :math:`c_{t-1,1}, \dots, c_{t-1,\nBlk}` and memory cell input activations
          :math:`g_{t,1}, \dots, g_{t,\nBlk}`.
       #. Derive new hidden states :math:`h_t` using output gate units :math:`o_{t,1}, \dots, o_{t,\nBlk}` and memory
          cell new internal states :math:`c_{t,1}, \dots, c_{t,\nBlk}`.

    #. Denote the concatenation of memory cell internal states :math:`c_1, \dots, c_S` as :math:`c`.
    #. Denote the concatenation of hidden states :math:`h_1, \dots, h_S` as :math:`h`.
    #. Return :math:`(c, h)`.

    Parameters
    ----------
    x: torch.Tensor
      Batch of sequences of input features.
      ``x`` has shape :math:`(B, S, \hIn)` and ``dtype == torch.float``.
    c_0: typing.Optional[torch.Tensor], default: None
      Batch of memory cell previous internal states.
      The tensor has shape :math:`(B, \nBlk, \dBlk)` and ``dtype == torch.float``.
      Set to ``None`` to use the initial memory internal state ``self.c_0``.
    h_0: typing.Optional[torch.Tensor], default: None
      Batch of previous hidden states.
      The tensor has shape :math:`(B, \hOut)` and ``dtype == torch.float``.
      Set to ``None`` to use the initial hidden states ``self.h_0``.

    Returns
    -------
    tuple[torch.Tensor, torch.Tensor]
      The first tensor is batch of memory cell internal states and the second tensor is batch of hidden states.
      Batch memory cell internal states has shape :math:`(B, S, \nBlk, \dBlk)` and ``dtype == torch.float``.
      Batch hidden states has shape :math:`(B, S, \hOut)` and ``dtype == torch.float``.
    """
    if c_0 is None:
      c_prev = self.c_0
    else:
      c_prev = c_0

    if h_0 is None:
      h_prev = self.h_0
    else:
      h_prev = h_0

    # Sequence length.
    S = x.size(1)

    # Transform input features to gate units.
    # Shape: (B, S, n_blk).
    x2fg = self.fc_x2fg(x)
    x2ig = self.fc_x2ig(x)
    x2og = self.fc_x2og(x)

    # Transform input features to memory cell block's input.
    # Shape: (B, S, d_hid).
    x2mc_in = self.fc_x2mc_in(x)

    # Perform recurrent calculation for `S` steps.
    c_all = []
    h_all = []
    for t in range(S):
      # Get forget / input / output gate units and unsqueeze to separate memory cell blocks.
      # Shape: (B, n_blk, 1).
      fg = torch.sigmoid(x2fg[:, t, :] + self.fc_h2fg(h_prev)).unsqueeze(-1)
      ig = torch.sigmoid(x2ig[:, t, :] + self.fc_h2ig(h_prev)).unsqueeze(-1)
      og = torch.sigmoid(x2og[:, t, :] + self.fc_h2og(h_prev)).unsqueeze(-1)

      # Calculate memory cell blocks input activation and reshape to separate memory cell blocks.
      # Shape: (B, n_blk, d_blk).
      mc_in = torch.tanh(x2mc_in[:, t, :] + self.fc_h2mc_in(h_prev)).reshape(-1, self.n_blk, self.d_blk)

      # Calculate memory cell new internal states.
      # Shape: (B, n_blk, d_blk).
      c_cur = fg * c_prev + ig * mc_in

      # Calculate memory cell outputs and flatten to form the new hidden states.
      # Shape: (B, d_hid).
      h_cur = (og * torch.tanh(c_cur)).reshape(-1, self.d_hid)

      c_all.append(c_cur)
      h_all.append(h_cur)

      # Update hidden states and memory cell internal states.
      c_prev = c_cur
      h_prev = h_cur

    # Stack list of tensors into single tensor.
    # In  shape: list of (B, d_hid) with length equals to `S`.
    # Out shape: (B, S, d_hid).
    h = torch.stack(h_all, dim=1)

    # Stack list of tensors into single tensor.
    # In  shape: list of (B, n_blk, d_blk) with length equals to `S`.
    # Out shape: (B, S, n_blk, d_blk).
    c = torch.stack(c_all, dim=1)

    return (c, h)

[docs]  def params_init(self) -> None:
    r"""Initialize model parameters.

    All weights and biases other than :math:`b_f, b_i, b_o` are initialized with uniform distribution
    :math:`\mathcal{U}\pa{\init_l, \init_u}`.
    :math:`b_f` is initialized with uniform distribution :math:`\mathcal{U}\pa{0, \init_{fb}}`.
    :math:`b_i` is initialized with uniform distribution :math:`\mathcal{U}\pa{\init_{ib}, 0}`.
    :math:`b_o` is initialized with uniform distribution :math:`\mathcal{U}\pa{\init_{ob}, 0}`.
    :math:`b_f` is initialized separatedly so that forget gates remain open at the start of training.
    :math:`b_i, b_o` are initialized separatedly so that input and output gates remain closed at the start of training.

    Returns
    -------
    None

    See Also
    --------
    ~lmp.model.LSTM1997Layer.params_init
      LSTM (1997 version) layer parameter initialization.
    """
    super().params_init()

    # Initialize weights and biases with uniform distribution.
    nn.init.uniform_(self.fc_x2fg.weight, self.init_lower, self.init_upper)
    nn.init.uniform_(self.fc_h2fg.weight, self.init_lower, self.init_upper)

    # Forget gate units' biases are initialized to positive values.
    nn.init.uniform_(self.fc_x2fg.bias, 0.0, self.init_fb)