Source code for lmp.dset._ch_poem

"""Chinese poetry dataset."""

import io
import os
import zipfile
from typing import ClassVar, List, Optional

# Typeshed for `pandas` is still under development, so we ignore type checks on `pandas` until its typeshed
# stabilizes and a stable version is released.
import pandas as pd  # type: ignore

import lmp.vars
from lmp.dset._base import BaseDset


class ChPoemDset(BaseDset):
    """Poems of ancient Chinese dynasties.

    See https://github.com/Werneror/Poetry for details on dataset.
    See https://github.com/ProFatXuanAll/demo-dataset for dataset preprocessing details.

    Here we list some dataset statistics.

    +--------------------+-----------------+-------------------+
    | dynasty            | number of poems | number of authors |
    +====================+=================+===================+
    | ``宋``             | 287114          | 9446              |
    +--------------------+-----------------+-------------------+
    | ``明``             | 236957          | 4439              |
    +--------------------+-----------------+-------------------+
    | ``清``             | 90089           | 8872              |
    +--------------------+-----------------+-------------------+
    | ``唐``             | 49195           | 2736              |
    +--------------------+-----------------+-------------------+
    | ``元``             | 37375           | 1209              |
    +--------------------+-----------------+-------------------+
    | ``近現代``         | 28419           | 790               |
    +--------------------+-----------------+-------------------+
    | ``當代``           | 28219           | 177               |
    +--------------------+-----------------+-------------------+
    | ``明末清初``       | 17700           | 176               |
    +--------------------+-----------------+-------------------+
    | ``元末明初``       | 15736           | 79                |
    +--------------------+-----------------+-------------------+
    | ``清末民國初``     | 15367           | 99                |
    +--------------------+-----------------+-------------------+
    | ``清末近現代初``   | 12464           | 48                |
    +--------------------+-----------------+-------------------+
    | ``宋末元初``       | 12058           | 41                |
    +--------------------+-----------------+-------------------+
    | ``南北朝``         | 4586            | 434               |
    +--------------------+-----------------+-------------------+
    | ``近現代末當代初`` | 3426            | 23                |
    +--------------------+-----------------+-------------------+
    | ``魏晉``           | 3020            | 251               |
    +--------------------+-----------------+-------------------+
    | ``金末元初``       | 3019            | 17                |
    +--------------------+-----------------+-------------------+
    | ``金``             | 2741            | 253               |
    +--------------------+-----------------+-------------------+
    | ``民國末當代初``   | 1948            | 9                 |
    +--------------------+-----------------+-------------------+
    | ``隋``             | 1170            | 84                |
    +--------------------+-----------------+-------------------+
    | ``唐末宋初``       | 1118            | 44                |
    +--------------------+-----------------+-------------------+
    | ``先秦``           | 570             | 8                 |
    +--------------------+-----------------+-------------------+
    | ``隋末唐初``       | 472             | 40                |
    +--------------------+-----------------+-------------------+
    | ``漢``             | 363             | 83                |
    +--------------------+-----------------+-------------------+
    | ``宋末金初``       | 234             | 9                 |
    +--------------------+-----------------+-------------------+
    | ``遼``             | 22              | 7                 |
    +--------------------+-----------------+-------------------+
    | ``秦``             | 2               | 2                 |
    +--------------------+-----------------+-------------------+
    | ``魏晉末南北朝初`` | 1               | 1                 |
    +--------------------+-----------------+-------------------+
    | total              | 853385          | 29377             |
    +--------------------+-----------------+-------------------+

    Samples are taken from the ``內容`` column of the dataset CSV file.  Each sample is converted to ``str`` and
    normalized with ``self.norm``.  Samples having fewer than 24 characters after normalization are discarded.

    Parameters
    ----------
    ver: Optional[str], default: None
      Version of the dataset.  Set to ``None`` to use the default version ``self.__class__.df_ver``.

    Attributes
    ----------
    df_ver: typing.ClassVar[str]
      Default version is ``'唐'``.
    dset_name: typing.ClassVar[str]
      CLI name of Chinese poem dataset is ``chinese-poem``.
    spls: list[str]
      All samples in the dataset.
    ver: str
      Version of the dataset.
    vers: typing.ClassVar[list[str]]
      All available versions of the dataset.  Versions are named after their appearing dynasty, including ``元``,
      ``元末明初``, ``先秦``, ``南北朝``, ``唐``, ``唐末宋初``, ``宋``, ``宋末元初``, ``宋末金初``, ``明``, ``明末清初``,
      ``民國末當代初``, ``清``, ``清末民國初``, ``清末近現代初``, ``漢``, ``當代``, ``秦``, ``近現代``,
      ``近現代末當代初``, ``遼``, ``金``, ``金末元初``, ``隋``, ``隋末唐初``, ``魏晉``, ``魏晉末南北朝初``.

    See Also
    --------
    :doc:`lmp.dset </dset/index>`
      All available datasets.

    Examples
    --------
    >>> from lmp.dset import ChPoemDset
    >>> dset = ChPoemDset(ver='唐')
    >>> dset[0][:10]
    '風淅淅。夜雨連雲黑。'
    """

    df_ver: ClassVar[str] = '唐'
    dset_name: ClassVar[str] = 'chinese-poem'
    vers: ClassVar[List[str]] = [
        '元',
        '元末明初',
        '先秦',
        '南北朝',
        '唐',
        '唐末宋初',
        '宋',
        '宋末元初',
        '宋末金初',
        '明',
        '明末清初',
        '民國末當代初',
        '清',
        '清末民國初',
        '清末近現代初',
        '漢',
        '當代',
        '秦',
        '近現代',
        '近現代末當代初',
        '遼',
        '金',
        '金末元初',
        '隋',
        '隋末唐初',
        '魏晉',
        '魏晉末南北朝初',
    ]

    def __init__(self, *, ver: Optional[str] = None):
        super().__init__(ver=ver)

        # Make sure dataset files exist.
        self.download_dataset(ver=self.ver)

        # Read the CSV file which was extracted from the Chinese poem zip file.
        df = pd.read_csv(os.path.join(lmp.vars.DATA_PATH, f'{self.ver}.csv'))

        # Normalize dataset.
        spls = df['內容'].apply(str).apply(self.norm).tolist()

        # Discard samples having fewer than 24 characters.
        self.spls.extend(spl for spl in spls if len(spl) >= 24)

    @classmethod
    def download_dataset(cls, ver: str) -> None:
        """Download Chinese poem dataset.

        Download zip file from GitHub and extract raw file from zip file.  Raw file is named as ``ver.csv``, where
        ``ver`` is the version of the dataset.  Raw file is always written with UTF-8 encoding.  Zip file is deleted
        after extracting raw file.  Nothing is downloaded when raw file already exists.

        ``ver`` is validated first:  it must be an instance of ``str`` and must be one of ``cls.vers``.  Validation
        errors are raised by the helpers in ``lmp.util.validate``.

        Parameters
        ----------
        ver: str
          Version of the dataset.

        Returns
        -------
        None
        """
        # `ver` validation.
        # NOTE(review): `lmp.util.validate` is not imported explicitly by this module; presumably it is loaded as a
        # side effect of importing `lmp.dset._base` -- confirm.
        lmp.util.validate.raise_if_not_instance(val=ver, val_name='ver', val_type=str)
        lmp.util.validate.raise_if_not_in(val=ver, val_name='ver', val_range=cls.vers)

        # Avoid duplicated download by checking whether raw file exists.
        raw_file_path = os.path.join(lmp.vars.DATA_PATH, f'{ver}.csv')
        if os.path.exists(raw_file_path):
            return

        # Download zip file path.
        zip_file_path = os.path.join(lmp.vars.DATA_PATH, f'{ver}.csv.zip')

        # We host this dataset on GitHub.
        url = f'https://github.com/ProFatXuanAll/demo-dataset/raw/main/ch-poem/{ver}.csv.zip'

        # Download dataset.
        BaseDset.download_file(mode='binary', download_path=zip_file_path, url=url)

        # Extract dataset from zip file.
        with zipfile.ZipFile(zip_file_path, 'r') as input_zipfile:
            with io.TextIOWrapper(input_zipfile.open(f'{ver}.csv', 'r'), encoding='utf-8') as input_text_file:
                data = input_text_file.read()

        # Write with an explicit UTF-8 encoding.  Relying on the platform default encoding can fail on Chinese text
        # or produce a file which cannot be decoded as UTF-8 when it is read back.
        with open(raw_file_path, 'w', encoding='utf-8') as output_text_file:
            output_text_file.write(data)

        # Remove downloaded zip file.
        os.remove(zip_file_path)