You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
284 lines
7.3 KiB
284 lines
7.3 KiB
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# 6.3 语言模型数据集(周杰伦专辑歌词)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"0.4.1\n",
|
|
"cpu\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import torch\n",
|
|
"import random\n",
|
|
"import zipfile\n",
|
|
"\n",
|
|
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
|
|
"print(torch.__version__)\n",
|
|
"print(device)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 6.3.1 读取数据集"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'想要有直升机\\n想要和你飞到宇宙去\\n想要和你融化在一起\\n融化在宇宙里\\n我每天每天每'"
|
|
]
|
|
},
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"with zipfile.ZipFile('../../data/jaychou_lyrics.txt.zip') as zin:\n",
|
|
" with zin.open('jaychou_lyrics.txt') as f:\n",
|
|
" corpus_chars = f.read().decode('utf-8')\n",
|
|
"corpus_chars[:40]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"corpus_chars = corpus_chars.replace('\\n', ' ').replace('\\r', ' ')\n",
|
|
"corpus_chars = corpus_chars[0:10000]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 6.3.2 建立字符索引"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"1027"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"idx_to_char = list(set(corpus_chars))\n",
|
|
"char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])\n",
|
|
"vocab_size = len(char_to_idx)\n",
|
|
"vocab_size"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"chars: 想要有直升机 想要和你飞到宇宙去 想要和\n",
|
|
"indices: [981, 858, 519, 53, 577, 1005, 299, 981, 858, 856, 550, 956, 672, 948, 1003, 334, 299, 981, 858, 856]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"corpus_indices = [char_to_idx[char] for char in corpus_chars]\n",
|
|
"sample = corpus_indices[:20]\n",
|
|
"print('chars:', ''.join([idx_to_char[idx] for idx in sample]))\n",
|
|
"print('indices:', sample)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 6.3.3 时序数据的采样\n",
|
|
"### 6.3.3.1 随机采样"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# 本函数已保存在d2lzh_pytorch包中方便以后使用\n",
|
|
"def data_iter_random(corpus_indices, batch_size, num_steps, device=None):\n",
|
|
" # 减1是因为输出Y的索引是相应输入X的索引加1\n",
|
|
" num_examples = (len(corpus_indices) - 1) // num_steps\n",
|
|
" epoch_size = num_examples // batch_size\n",
|
|
" example_indices = list(range(num_examples))\n",
|
|
" random.shuffle(example_indices)\n",
|
|
"\n",
|
|
" # 返回从pos开始的长为num_steps的序列\n",
|
|
" def _data(pos):\n",
|
|
" return corpus_indices[pos: pos + num_steps]\n",
|
|
" if device is None:\n",
|
|
" device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
|
|
" \n",
|
|
" for i in range(epoch_size):\n",
|
|
" # 每次读取batch_size个随机样本\n",
|
|
" i = i * batch_size\n",
|
|
" batch_indices = example_indices[i: i + batch_size]\n",
|
|
" X = [_data(j * num_steps) for j in batch_indices]\n",
|
|
" Y = [_data(j * num_steps + 1) for j in batch_indices]\n",
|
|
" yield torch.tensor(X, dtype=torch.float32, device=device), torch.tensor(Y, dtype=torch.float32, device=device)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"X: tensor([[18., 19., 20., 21., 22., 23.],\n",
|
|
" [12., 13., 14., 15., 16., 17.]]) \n",
|
|
"Y: tensor([[19., 20., 21., 22., 23., 24.],\n",
|
|
" [13., 14., 15., 16., 17., 18.]]) \n",
|
|
"\n",
|
|
"X: tensor([[ 0., 1., 2., 3., 4., 5.],\n",
|
|
" [ 6., 7., 8., 9., 10., 11.]]) \n",
|
|
"Y: tensor([[ 1., 2., 3., 4., 5., 6.],\n",
|
|
" [ 7., 8., 9., 10., 11., 12.]]) \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"my_seq = list(range(30))\n",
|
|
"for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):\n",
|
|
" print('X: ', X, '\\nY:', Y, '\\n')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### 6.3.3.2 相邻采样"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# 本函数已保存在d2lzh_pytorch包中方便以后使用\n",
|
|
"def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):\n",
|
|
" if device is None:\n",
|
|
" device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
|
|
" corpus_indices = torch.tensor(corpus_indices, dtype=torch.float32, device=device)\n",
|
|
" data_len = len(corpus_indices)\n",
|
|
" batch_len = data_len // batch_size\n",
|
|
" indices = corpus_indices[0: batch_size*batch_len].view(batch_size, batch_len)\n",
|
|
" epoch_size = (batch_len - 1) // num_steps\n",
|
|
" for i in range(epoch_size):\n",
|
|
" i = i * num_steps\n",
|
|
" X = indices[:, i: i + num_steps]\n",
|
|
" Y = indices[:, i + 1: i + num_steps + 1]\n",
|
|
" yield X, Y"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"X: tensor([[ 0., 1., 2., 3., 4., 5.],\n",
|
|
" [15., 16., 17., 18., 19., 20.]]) \n",
|
|
"Y: tensor([[ 1., 2., 3., 4., 5., 6.],\n",
|
|
" [16., 17., 18., 19., 20., 21.]]) \n",
|
|
"\n",
|
|
"X: tensor([[ 6., 7., 8., 9., 10., 11.],\n",
|
|
" [21., 22., 23., 24., 25., 26.]]) \n",
|
|
"Y: tensor([[ 7., 8., 9., 10., 11., 12.],\n",
|
|
" [22., 23., 24., 25., 26., 27.]]) \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):\n",
|
|
" print('X: ', X, '\\nY:', Y, '\\n')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python [default]",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|