Dive-into-DL-PyTorch/code/chapter06_RNN/6.3_lang-model-dataset.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6.3 语言模型数据集（周杰伦专辑歌词）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.4.1\n",
      "cpu\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import random\n",
    "import zipfile\n",
    "\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "print(torch.__version__)\n",
    "print(device)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.3.1 读取数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'想要有直升机\\n想要和你飞到宇宙去\\n想要和你融化在一起\\n融化在宇宙里\\n我每天每天每'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with zipfile.ZipFile('../../data/jaychou_lyrics.txt.zip') as zin:\n",
    "    with zin.open('jaychou_lyrics.txt') as f:\n",
    "        corpus_chars = f.read().decode('utf-8')\n",
    "corpus_chars[:40]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "corpus_chars = corpus_chars.replace('\\n', ' ').replace('\\r', ' ')\n",
    "corpus_chars = corpus_chars[0:10000]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.3.2 建立字符索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1027"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx_to_char = list(set(corpus_chars))\n",
    "char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])\n",
    "vocab_size = len(char_to_idx)\n",
    "vocab_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "chars: 想要有直升机 想要和你飞到宇宙去 想要和\n",
      "indices: [981, 858, 519, 53, 577, 1005, 299, 981, 858, 856, 550, 956, 672, 948, 1003, 334, 299, 981, 858, 856]\n"
     ]
    }
   ],
   "source": [
    "corpus_indices = [char_to_idx[char] for char in corpus_chars]\n",
    "sample = corpus_indices[:20]\n",
    "print('chars:', ''.join([idx_to_char[idx] for idx in sample]))\n",
    "print('indices:', sample)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.3.3 时序数据的采样\n",
    "### 6.3.3.1 随机采样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 本函数已保存在d2lzh_pytorch包中方便以后使用\n",
    "def data_iter_random(corpus_indices, batch_size, num_steps, device=None):\n",
    "    # 减1是因为输出的索引x是相应输入的索引y加1\n",
    "    num_examples = (len(corpus_indices) - 1) // num_steps\n",
    "    epoch_size = num_examples // batch_size\n",
    "    example_indices = list(range(num_examples))\n",
    "    random.shuffle(example_indices)\n",
    "\n",
    "    # 返回从pos开始的长为num_steps的序列\n",
    "    def _data(pos):\n",
    "        return corpus_indices[pos: pos + num_steps]\n",
    "    if device is None:\n",
    "        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "    \n",
    "    for i in range(epoch_size):\n",
    "        # 每次读取batch_size个随机样本\n",
    "        i = i * batch_size\n",
    "        batch_indices = example_indices[i: i + batch_size]\n",
    "        X = [_data(j * num_steps) for j in batch_indices]\n",
    "        Y = [_data(j * num_steps + 1) for j in batch_indices]\n",
    "        yield torch.tensor(X, dtype=torch.float32, device=device), torch.tensor(Y, dtype=torch.float32, device=device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X:  tensor([[18., 19., 20., 21., 22., 23.],\n",
      "        [12., 13., 14., 15., 16., 17.]]) \n",
      "Y: tensor([[19., 20., 21., 22., 23., 24.],\n",
      "        [13., 14., 15., 16., 17., 18.]]) \n",
      "\n",
      "X:  tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],\n",
      "        [ 6.,  7.,  8.,  9., 10., 11.]]) \n",
      "Y: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],\n",
      "        [ 7.,  8.,  9., 10., 11., 12.]]) \n",
      "\n"
     ]
    }
   ],
   "source": [
    "my_seq = list(range(30))\n",
    "for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):\n",
    "    print('X: ', X, '\\nY:', Y, '\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.3.3.2 相邻采样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 本函数已保存在d2lzh_pytorch包中方便以后使用\n",
    "def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):\n",
    "    if device is None:\n",
    "        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "    corpus_indices = torch.tensor(corpus_indices, dtype=torch.float32, device=device)\n",
    "    data_len = len(corpus_indices)\n",
    "    batch_len = data_len // batch_size\n",
    "    indices = corpus_indices[0: batch_size*batch_len].view(batch_size, batch_len)\n",
    "    epoch_size = (batch_len - 1) // num_steps\n",
    "    for i in range(epoch_size):\n",
    "        i = i * num_steps\n",
    "        X = indices[:, i: i + num_steps]\n",
    "        Y = indices[:, i + 1: i + num_steps + 1]\n",
    "        yield X, Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X:  tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],\n",
      "        [15., 16., 17., 18., 19., 20.]]) \n",
      "Y: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],\n",
      "        [16., 17., 18., 19., 20., 21.]]) \n",
      "\n",
      "X:  tensor([[ 6.,  7.,  8.,  9., 10., 11.],\n",
      "        [21., 22., 23., 24., 25., 26.]]) \n",
      "Y: tensor([[ 7.,  8.,  9., 10., 11., 12.],\n",
      "        [22., 23., 24., 25., 26., 27.]]) \n",
      "\n"
     ]
    }
   ],
   "source": [
    "for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):\n",
    "    print('X: ', X, '\\nY:', Y, '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}