{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 3.16 实战Kaggle比赛:房价预测" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.2.0\n" ] } ], "source": [ "# 如果没有安装pandas,则反注释下面一行\n", "# !pip install pandas\n", "\n", "%matplotlib inline\n", "import torch\n", "import torch.nn as nn\n", "import numpy as np\n", "import pandas as pd\n", "import sys\n", "sys.path.append(\"..\") \n", "import d2lzh_pytorch as d2l\n", "\n", "print(torch.__version__)\n", "torch.set_default_tensor_type(torch.FloatTensor)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3.16.2 获取和读取数据集" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "train_data = pd.read_csv('../../data/kaggle_house/train.csv')\n", "test_data = pd.read_csv('../../data/kaggle_house/test.csv')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1460, 81)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_data.shape" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1459, 80)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_data.shape" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassMSZoningLotFrontageSaleTypeSaleConditionSalePrice
0160RL65.0WDNormal208500
1220RL80.0WDNormal181500
2360RL68.0WDNormal223500
3470RL60.0WDAbnorml140000
\n", "
" ], "text/plain": [ " Id MSSubClass MSZoning LotFrontage SaleType SaleCondition SalePrice\n", "0 1 60 RL 65.0 WD Normal 208500\n", "1 2 20 RL 80.0 WD Normal 181500\n", "2 3 60 RL 68.0 WD Normal 223500\n", "3 4 70 RL 60.0 WD Abnorml 140000" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3.16.3 预处理数据" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index\n", "all_features[numeric_features] = all_features[numeric_features].apply(\n", " lambda x: (x - x.mean()) / (x.std()))\n", "# 标准化后,每个数值特征的均值变为0,所以可以直接用0来替换缺失值\n", "all_features[numeric_features] = all_features[numeric_features].fillna(0)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2919, 331)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# dummy_na=True将缺失值也当作合法的特征值并为其创建指示特征\n", "all_features = pd.get_dummies(all_features, dummy_na=True)\n", "all_features.shape" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "n_train = train_data.shape[0]\n", "train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float)\n", "test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)\n", "train_labels = torch.tensor(train_data.SalePrice.values, dtype=torch.float).view(-1, 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3.16.4 训练模型" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "loss = torch.nn.MSELoss()\n", "\n", "def get_net(feature_num):\n", " net = nn.Linear(feature_num, 1)\n", " for param in net.parameters():\n", " nn.init.normal_(param, mean=0, std=0.01)\n", " return net" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def log_rmse(net, features, labels):\n", " with torch.no_grad():\n", " # 将小于1的值设成1,使得取对数时数值更稳定\n", " clipped_preds = torch.max(net(features), torch.tensor(1.0))\n", " rmse = torch.sqrt(loss(clipped_preds.log(), labels.log()))\n", " return rmse.item()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def train(net, train_features, train_labels, test_features, test_labels,\n", " num_epochs, learning_rate, weight_decay, batch_size):\n", " train_ls, test_ls = [], []\n", " dataset = torch.utils.data.TensorDataset(train_features, train_labels)\n", " train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)\n", " # 这里使用了Adam优化算法\n", " optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate, weight_decay=weight_decay) \n", " net = net.float()\n", " for epoch in range(num_epochs):\n", " for X, y in train_iter:\n", " l = loss(net(X.float()), y.float())\n", " optimizer.zero_grad()\n", " l.backward()\n", " optimizer.step()\n", " train_ls.append(log_rmse(net, train_features, train_labels))\n", " if test_labels is not None:\n", " test_ls.append(log_rmse(net, test_features, test_labels))\n", " return train_ls, test_ls" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3.16.5 $K$折交叉验证" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def get_k_fold_data(k, i, X, y):\n", " # 返回第i折交叉验证时所需要的训练和验证数据\n", " assert k > 1\n", " fold_size = X.shape[0] // k\n", " X_train, y_train = None, None\n", " for j in range(k):\n", " idx = slice(j * fold_size, (j + 1) * fold_size)\n", " X_part, y_part = X[idx, :], y[idx]\n", " if j == i:\n", " X_valid, y_valid = X_part, y_part\n", " elif X_train is None:\n", " X_train, y_train = X_part, y_part\n", " else:\n", " X_train = torch.cat((X_train, X_part), dim=0)\n", " y_train = torch.cat((y_train, y_part), dim=0)\n", " return X_train, y_train, X_valid, y_valid" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def k_fold(k, X_train, y_train, num_epochs,\n", " learning_rate, weight_decay, batch_size):\n", " train_l_sum, valid_l_sum = 0, 0\n", " for i in range(k):\n", " data = get_k_fold_data(k, i, X_train, y_train)\n", " net = get_net(X_train.shape[1])\n", " train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,\n", " weight_decay, batch_size)\n", " train_l_sum += train_ls[-1]\n", " valid_l_sum += valid_ls[-1]\n", " if i == 0:\n", " d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',\n", " range(1, num_epochs + 1), valid_ls,\n", " ['train', 'valid'])\n", " print('fold %d, train rmse %f, valid rmse %f' % (i, train_ls[-1], valid_ls[-1]))\n", " return train_l_sum / k, valid_l_sum / k" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3.16.6 模型选择" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "fold 0, train rmse 0.170585, valid rmse 0.156860\n", "fold 1, train rmse 0.162552, valid rmse 0.190944\n", "fold 2, train rmse 0.164199, valid rmse 0.168767\n", "fold 3, train rmse 0.168698, valid rmse 0.154873\n", "fold 4, train rmse 0.163213, valid rmse 0.183080\n", "5-fold validation: avg train rmse 0.165849, avg valid rmse 0.170905\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64\n", "train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)\n", "print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3.16.7 预测并在Kaggle提交结果" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def train_and_pred(train_features, test_features, train_labels, test_data,\n", " num_epochs, lr, weight_decay, batch_size):\n", " net = get_net(train_features.shape[1])\n", " train_ls, _ = train(net, train_features, train_labels, None, None,\n", " num_epochs, lr, weight_decay, batch_size)\n", " d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse')\n", " print('train rmse %f' % train_ls[-1])\n", " preds = net(test_features).detach().numpy()\n", " test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])\n", " submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)\n", " # submission.to_csv('./submission.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train rmse 0.162085\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }