You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

284 lines
7.3 KiB

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 6.3 语言模型数据集(周杰伦专辑歌词)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.4.1\n",
"cpu\n"
]
}
],
"source": [
"import torch\n",
"import random\n",
"import zipfile\n",
"\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"print(torch.__version__)\n",
"print(device)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6.3.1 读取数据集"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'想要有直升机\\n想要和你飞到宇宙去\\n想要和你融化在一起\\n融化在宇宙里\\n我每天每天每'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with zipfile.ZipFile('../../data/jaychou_lyrics.txt.zip') as zin:\n",
" with zin.open('jaychou_lyrics.txt') as f:\n",
" corpus_chars = f.read().decode('utf-8')\n",
"corpus_chars[:40]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"corpus_chars = corpus_chars.replace('\\n', ' ').replace('\\r', ' ')\n",
"corpus_chars = corpus_chars[0:10000]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6.3.2 建立字符索引"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1027"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"idx_to_char = list(set(corpus_chars))\n",
"char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])\n",
"vocab_size = len(char_to_idx)\n",
"vocab_size"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"chars: 想要有直升机 想要和你飞到宇宙去 想要和\n",
"indices: [981, 858, 519, 53, 577, 1005, 299, 981, 858, 856, 550, 956, 672, 948, 1003, 334, 299, 981, 858, 856]\n"
]
}
],
"source": [
"corpus_indices = [char_to_idx[char] for char in corpus_chars]\n",
"sample = corpus_indices[:20]\n",
"print('chars:', ''.join([idx_to_char[idx] for idx in sample]))\n",
"print('indices:', sample)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6.3.3 时序数据的采样\n",
"### 6.3.3.1 随机采样"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# 本函数已保存在d2lzh_pytorch包中方便以后使用\n",
"def data_iter_random(corpus_indices, batch_size, num_steps, device=None):\n",
" # 减1是因为输出的索引x是相应输入的索引y加1\n",
" num_examples = (len(corpus_indices) - 1) // num_steps\n",
" epoch_size = num_examples // batch_size\n",
" example_indices = list(range(num_examples))\n",
" random.shuffle(example_indices)\n",
"\n",
" # 返回从pos开始的长为num_steps的序列\n",
" def _data(pos):\n",
" return corpus_indices[pos: pos + num_steps]\n",
" if device is None:\n",
" device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
" \n",
" for i in range(epoch_size):\n",
" # 每次读取batch_size个随机样本\n",
" i = i * batch_size\n",
" batch_indices = example_indices[i: i + batch_size]\n",
" X = [_data(j * num_steps) for j in batch_indices]\n",
" Y = [_data(j * num_steps + 1) for j in batch_indices]\n",
" yield torch.tensor(X, dtype=torch.float32, device=device), torch.tensor(Y, dtype=torch.float32, device=device)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X: tensor([[18., 19., 20., 21., 22., 23.],\n",
" [12., 13., 14., 15., 16., 17.]]) \n",
"Y: tensor([[19., 20., 21., 22., 23., 24.],\n",
" [13., 14., 15., 16., 17., 18.]]) \n",
"\n",
"X: tensor([[ 0., 1., 2., 3., 4., 5.],\n",
" [ 6., 7., 8., 9., 10., 11.]]) \n",
"Y: tensor([[ 1., 2., 3., 4., 5., 6.],\n",
" [ 7., 8., 9., 10., 11., 12.]]) \n",
"\n"
]
}
],
"source": [
"my_seq = list(range(30))\n",
"for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):\n",
" print('X: ', X, '\\nY:', Y, '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 6.3.3.2 相邻采样"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# 本函数已保存在d2lzh_pytorch包中方便以后使用\n",
"def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):\n",
" if device is None:\n",
" device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
" corpus_indices = torch.tensor(corpus_indices, dtype=torch.float32, device=device)\n",
" data_len = len(corpus_indices)\n",
" batch_len = data_len // batch_size\n",
" indices = corpus_indices[0: batch_size*batch_len].view(batch_size, batch_len)\n",
" epoch_size = (batch_len - 1) // num_steps\n",
" for i in range(epoch_size):\n",
" i = i * num_steps\n",
" X = indices[:, i: i + num_steps]\n",
" Y = indices[:, i + 1: i + num_steps + 1]\n",
" yield X, Y"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X: tensor([[ 0., 1., 2., 3., 4., 5.],\n",
" [15., 16., 17., 18., 19., 20.]]) \n",
"Y: tensor([[ 1., 2., 3., 4., 5., 6.],\n",
" [16., 17., 18., 19., 20., 21.]]) \n",
"\n",
"X: tensor([[ 6., 7., 8., 9., 10., 11.],\n",
" [21., 22., 23., 24., 25., 26.]]) \n",
"Y: tensor([[ 7., 8., 9., 10., 11., 12.],\n",
" [22., 23., 24., 25., 26., 27.]]) \n",
"\n"
]
}
],
"source": [
"for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):\n",
" print('X: ', X, '\\nY:', Y, '\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}