# 6.3 Language model dataset (Jay Chou album lyrics)
#
# Top-level notebook script: picks the compute device, reads the lyrics corpus
# out of the zip archive, normalises line breaks, builds the character
# vocabulary and encodes the corpus as a list of integer indices.
#
# Names defined for later cells: device, corpus_chars, idx_to_char,
# char_to_idx, vocab_size, corpus_indices, sample.

import random
import zipfile

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.__version__)
print(device)

# --- 6.3.1 Read the dataset -------------------------------------------------
with zipfile.ZipFile('../../data/jaychou_lyrics.txt.zip') as zin:
    with zin.open('jaychou_lyrics.txt') as f:
        corpus_chars = f.read().decode('utf-8')
corpus_chars[:40]

# Only the first MAX_CORPUS_CHARS characters are kept.
MAX_CORPUS_CHARS = 10000

# Line breaks are replaced by spaces so the corpus is one continuous sequence.
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[0:MAX_CORPUS_CHARS]

# --- 6.3.2 Build the character index ----------------------------------------
# sorted() gives a stable vocabulary order. Iterating a bare set of strings
# depends on per-process hash randomisation, so list(set(...)) produced a
# different char <-> index mapping on every kernel restart.
idx_to_char = sorted(set(corpus_chars))
char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
vocab_size = len(char_to_idx)
vocab_size

# Encode the whole corpus and show the first 20 characters with their indices.
corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)
# This function is saved in the d2lzh_pytorch package for later use.
def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    """Yield (X, Y) mini-batches of randomly ordered, non-overlapping windows.

    The sequence is cut into consecutive windows of ``num_steps`` items
    starting at position 0; the window order is shuffled with
    ``random.shuffle`` and the windows are served ``batch_size`` at a time.
    Windows that do not fill a complete batch are dropped.

    Args:
        corpus_indices: sliceable sequence of numeric indices.
        batch_size: number of windows per mini-batch.
        num_steps: length of each window.
        device: torch device for the yielded tensors; when None, CUDA is
            used if available, otherwise the CPU.

    Yields:
        Tuple ``(X, Y)`` of float32 tensors built from ``batch_size`` windows
        of ``num_steps`` items each, where Y holds the same windows as X
        shifted one position to the right in the sequence.
    """
    # The last item can only serve as a label (Y is X shifted by one), hence
    # the "- 1" when counting how many full windows fit.
    window_count = (len(corpus_indices) - 1) // num_steps
    batches_per_epoch = window_count // batch_size
    window_order = list(range(window_count))
    random.shuffle(window_order)

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for batch_no in range(batches_per_epoch):
        # Pick the next batch_size shuffled windows and turn them into offsets.
        first = batch_no * batch_size
        starts = [w * num_steps for w in window_order[first: first + batch_size]]
        inputs = [corpus_indices[s: s + num_steps] for s in starts]
        labels = [corpus_indices[s + 1: s + 1 + num_steps] for s in starts]
        yield (
            torch.tensor(inputs, dtype=torch.float32, device=device),
            torch.tensor(labels, dtype=torch.float32, device=device),
        )
# This function is saved in the d2lzh_pytorch package for later use.
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    """Yield (X, Y) mini-batches whose rows continue from one batch to the next.

    The sequence is converted to a float32 tensor, truncated to a multiple of
    ``batch_size`` and viewed as ``batch_size`` rows. Each yielded X is a
    ``num_steps``-wide column slice of that matrix, and Y is the same slice
    shifted one column to the right.

    Args:
        corpus_indices: sequence of numeric indices accepted by torch.tensor.
        batch_size: number of rows in every mini-batch.
        num_steps: number of columns in every mini-batch.
        device: torch device for the tensors; when None, CUDA is used if
            available, otherwise the CPU.

    Yields:
        Tuple ``(X, Y)`` of float32 tensors of shape (batch_size, num_steps).
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    flat = torch.tensor(corpus_indices, dtype=torch.float32, device=device)
    row_len = len(flat) // batch_size
    # Drop the tail that does not fill a complete row, then lay out the rows.
    grid = flat[0: batch_size * row_len].view(batch_size, row_len)

    # "- 1" keeps one spare column so the shifted label slice stays in range.
    steps_per_epoch = (row_len - 1) // num_steps
    for step in range(steps_per_epoch):
        lo = step * num_steps
        hi = lo + num_steps
        yield grid[:, lo: hi], grid[:, lo + 1: hi + 1]