"""Chinese poetry dataset."""
import io
import os
import zipfile
from typing import ClassVar, List, Optional
# Type stubs for `pandas` are still under development, so we skip type checking on `pandas` until a stable version of
# its stubs is released.
import pandas as pd # type: ignore
import lmp.vars
from lmp.dset._base import BaseDset
class ChPoemDset(BaseDset):
    """Poems of ancient Chinese dynasty.

    See https://github.com/Werneror/Poetry for details on dataset.
    See https://github.com/ProFatXuanAll/demo-dataset for dataset preprocessing details.

    Here we list some dataset statistics.

    +--------------------+-----------------+-------------------+
    | dynasty            | number of poems | number of authors |
    +====================+=================+===================+
    | ``宋``             | 287114          | 9446              |
    +--------------------+-----------------+-------------------+
    | ``明``             | 236957          | 4439              |
    +--------------------+-----------------+-------------------+
    | ``清``             | 90089           | 8872              |
    +--------------------+-----------------+-------------------+
    | ``唐``             | 49195           | 2736              |
    +--------------------+-----------------+-------------------+
    | ``元``             | 37375           | 1209              |
    +--------------------+-----------------+-------------------+
    | ``近現代``         | 28419           | 790               |
    +--------------------+-----------------+-------------------+
    | ``當代``           | 28219           | 177               |
    +--------------------+-----------------+-------------------+
    | ``明末清初``       | 17700           | 176               |
    +--------------------+-----------------+-------------------+
    | ``元末明初``       | 15736           | 79                |
    +--------------------+-----------------+-------------------+
    | ``清末民國初``     | 15367           | 99                |
    +--------------------+-----------------+-------------------+
    | ``清末近現代初``   | 12464           | 48                |
    +--------------------+-----------------+-------------------+
    | ``宋末元初``       | 12058           | 41                |
    +--------------------+-----------------+-------------------+
    | ``南北朝``         | 4586            | 434               |
    +--------------------+-----------------+-------------------+
    | ``近現代末當代初`` | 3426            | 23                |
    +--------------------+-----------------+-------------------+
    | ``魏晉``           | 3020            | 251               |
    +--------------------+-----------------+-------------------+
    | ``金末元初``       | 3019            | 17                |
    +--------------------+-----------------+-------------------+
    | ``金``             | 2741            | 253               |
    +--------------------+-----------------+-------------------+
    | ``民國末當代初``   | 1948            | 9                 |
    +--------------------+-----------------+-------------------+
    | ``隋``             | 1170            | 84                |
    +--------------------+-----------------+-------------------+
    | ``唐末宋初``       | 1118            | 44                |
    +--------------------+-----------------+-------------------+
    | ``先秦``           | 570             | 8                 |
    +--------------------+-----------------+-------------------+
    | ``隋末唐初``       | 472             | 40                |
    +--------------------+-----------------+-------------------+
    | ``漢``             | 363             | 83                |
    +--------------------+-----------------+-------------------+
    | ``宋末金初``       | 234             | 9                 |
    +--------------------+-----------------+-------------------+
    | ``遼``             | 22              | 7                 |
    +--------------------+-----------------+-------------------+
    | ``秦``             | 2               | 2                 |
    +--------------------+-----------------+-------------------+
    | ``魏晉末南北朝初`` | 1               | 1                 |
    +--------------------+-----------------+-------------------+
    | total              | 853385          | 29377             |
    +--------------------+-----------------+-------------------+

    Parameters
    ----------
    ver: Optional[str], default: None
        Version of the dataset.
        Set to ``None`` to use the default version ``self.__class__.df_ver``.

    Attributes
    ----------
    df_ver: typing.ClassVar[str]
        Default version is ``'唐'``.
    dset_name: typing.ClassVar[str]
        CLI name of Chinese poem dataset is ``chinese-poem``.
    spls: list[str]
        All samples in the dataset.
    ver: str
        Version of the dataset.
    vers: typing.ClassVar[list[str]]
        All available versions of the dataset.
        Versions are named after their appearing dynasty, including ``元``, ``元末明初``, ``先秦``, ``南北朝``, ``唐``,
        ``唐末宋初``, ``宋``, ``宋末元初``, ``宋末金初``, ``明``, ``明末清初``, ``民國末當代初``, ``清``, ``清末民國初``,
        ``清末近現代初``, ``漢``, ``當代``, ``秦``, ``近現代``, ``近現代末當代初``, ``遼``, ``金``, ``金末元初``, ``隋``,
        ``隋末唐初``, ``魏晉``, ``魏晉末南北朝初``.

    See Also
    --------
    :doc:`lmp.dset </dset/index>`
        All available datasets.

    Examples
    --------
    >>> from lmp.dset import ChPoemDset
    >>> dset = ChPoemDset(ver='唐')
    >>> dset[0][:10]
    '風淅淅。夜雨連雲黑。'
    """

    df_ver: ClassVar[str] = '唐'
    dset_name: ClassVar[str] = 'chinese-poem'
    vers: ClassVar[List[str]] = [
        '元',
        '元末明初',
        '先秦',
        '南北朝',
        '唐',
        '唐末宋初',
        '宋',
        '宋末元初',
        '宋末金初',
        '明',
        '明末清初',
        '民國末當代初',
        '清',
        '清末民國初',
        '清末近現代初',
        '漢',
        '當代',
        '秦',
        '近現代',
        '近現代末當代初',
        '遼',
        '金',
        '金末元初',
        '隋',
        '隋末唐初',
        '魏晉',
        '魏晉末南北朝初',
    ]

    def __init__(self, *, ver: Optional[str] = None):
        super().__init__(ver=ver)

        # Make sure the extracted CSV file exists (downloads it when missing).
        self.download_dataset(ver=self.ver)

        # Read the extracted CSV file; the zip archive has already been unpacked and removed by `download_dataset`.
        df = pd.read_csv(os.path.join(lmp.vars.DATA_PATH, f'{self.ver}.csv'))

        # Normalize every poem in the `內容` column.  Values are cast to `str` first so that non-string cells can
        # still be passed to `self.norm`.
        normalized = df['內容'].apply(str).apply(self.norm).tolist()

        # Discard samples having fewer than 24 characters.
        self.spls.extend(spl for spl in normalized if len(spl) >= 24)

    @classmethod
    def download_dataset(cls, ver: str) -> None:
        """Download Chinese poem dataset.

        Download zip file from GitHub and extract raw file from zip file.
        Raw file is named as ``ver.csv``, where ``ver`` is the version of the dataset.
        Raw file is always written with UTF-8 encoding, regardless of the platform default encoding.
        Zip file is deleted after extraction, whether or not extraction succeeded.
        Nothing is downloaded when the raw file already exists.

        Parameters
        ----------
        ver: str
            Version of the dataset.
            Must be one of the versions listed in ``cls.vers``.

        Returns
        -------
        None
        """
        # `ver` validation.
        # NOTE(review): `lmp.util.validate` is not imported explicitly in the visible imports; presumably it is loaded
        # transitively (e.g. through `lmp.dset._base`) -- confirm.
        lmp.util.validate.raise_if_not_instance(val=ver, val_name='ver', val_type=str)
        lmp.util.validate.raise_if_not_in(val=ver, val_name='ver', val_range=cls.vers)

        # Avoid duplicated download by checking whether raw file exists.
        raw_file_path = os.path.join(lmp.vars.DATA_PATH, f'{ver}.csv')
        if os.path.exists(raw_file_path):
            return

        # Download zip file path.
        zip_file_path = os.path.join(lmp.vars.DATA_PATH, f'{ver}.csv.zip')

        # We host this dataset on GitHub.
        url = f'https://github.com/ProFatXuanAll/demo-dataset/raw/main/ch-poem/{ver}.csv.zip'

        # Download dataset.
        BaseDset.download_file(mode='binary', download_path=zip_file_path, url=url)

        try:
            # Extract dataset from zip file.  The archive member is decoded as UTF-8 text.
            with zipfile.ZipFile(zip_file_path, 'r') as input_zipfile:
                with io.TextIOWrapper(input_zipfile.open(f'{ver}.csv', 'r'), encoding='utf-8') as input_text_file:
                    data = input_text_file.read()

            # Write with an explicit UTF-8 encoding.  Relying on the platform default encoding can fail on Chinese
            # text (or produce a file that cannot be read back as UTF-8) on systems whose locale is not UTF-8.
            with open(raw_file_path, 'w', encoding='utf-8') as output_text_file:
                output_text_file.write(data)
        finally:
            # Remove downloaded zip file even when extraction failed, so no stale archive is left behind.
            os.remove(zip_file_path)