{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 9.11 样式迁移"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cuda 1.1.0\n"
]
}
],
"source": [
"%matplotlib inline\n",
"import time\n",
"import torch\n",
"import torch.nn.functional as F\n",
"import torchvision\n",
"import numpy as np\n",
"from PIL import Image\n",
"\n",
"import sys\n",
"sys.path.append(\"..\") \n",
"import d2lzh_pytorch as d2l\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # 均已测试\n",
"\n",
"print(device, torch.__version__)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 9.11.2 读取内容图像和样式图像"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"d2l.set_figsize()\n",
"content_img = Image.open('../../data/rainier.jpg')\n",
"d2l.plt.imshow(content_img);"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"d2l.set_figsize()\n",
"style_img = Image.open('../../data/autumn_oak.jpg')\n",
"d2l.plt.imshow(style_img);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 9.11.3. 预处理和后处理图像"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"rgb_mean = np.array([0.485, 0.456, 0.406])\n",
"rgb_std = np.array([0.229, 0.224, 0.225])\n",
"\n",
"def preprocess(PIL_img, image_shape):\n",
" process = torchvision.transforms.Compose([\n",
" torchvision.transforms.Resize(image_shape),\n",
" torchvision.transforms.ToTensor(),\n",
" torchvision.transforms.Normalize(mean=rgb_mean, std=rgb_std)])\n",
"\n",
" return process(PIL_img).unsqueeze(dim = 0) # (batch_size, 3, H, W)\n",
"\n",
"def postprocess(img_tensor):\n",
" inv_normalize = torchvision.transforms.Normalize(\n",
" mean= -rgb_mean / rgb_std,\n",
" std= 1/rgb_std)\n",
" to_PIL_image = torchvision.transforms.ToPILImage()\n",
" return to_PIL_image(inv_normalize(img_tensor[0].cpu()).clamp(0, 1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 9.11.4 抽取特征"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/data1/tangss/PyTorch_pretrainedmodels\r\n"
]
}
],
"source": [
"!echo $TORCH_HOME # 将会把预训练好的模型下载到此处(没有输出的话默认是.cache/torch)\n",
"pretrained_net = torchvision.models.vgg19(pretrained=True, progress=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"VGG(\n",
" (features): Sequential(\n",
" (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (1): ReLU(inplace)\n",
" (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (3): ReLU(inplace)\n",
" (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
" (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (6): ReLU(inplace)\n",
" (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (8): ReLU(inplace)\n",
" (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
" (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (11): ReLU(inplace)\n",
" (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (13): ReLU(inplace)\n",
" (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (15): ReLU(inplace)\n",
" (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (17): ReLU(inplace)\n",
" (18): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
" (19): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (20): ReLU(inplace)\n",
" (21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (22): ReLU(inplace)\n",
" (23): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (24): ReLU(inplace)\n",
" (25): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (26): ReLU(inplace)\n",
" (27): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
" (28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (29): ReLU(inplace)\n",
" (30): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (31): ReLU(inplace)\n",
" (32): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (33): ReLU(inplace)\n",
" (34): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (35): ReLU(inplace)\n",
" (36): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n",
" )\n",
" (avgpool): AdaptiveAvgPool2d(output_size=(7, 7))\n",
" (classifier): Sequential(\n",
" (0): Linear(in_features=25088, out_features=4096, bias=True)\n",
" (1): ReLU(inplace)\n",
" (2): Dropout(p=0.5)\n",
" (3): Linear(in_features=4096, out_features=4096, bias=True)\n",
" (4): ReLU(inplace)\n",
" (5): Dropout(p=0.5)\n",
" (6): Linear(in_features=4096, out_features=1000, bias=True)\n",
" )\n",
")"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pretrained_net"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"style_layers, content_layers = [0, 5, 10, 19, 28], [25]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"net_list = []\n",
"for i in range(max(content_layers + style_layers) + 1):\n",
" net_list.append(pretrained_net.features[i])\n",
"net = torch.nn.Sequential(*net_list)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def extract_features(X, content_layers, style_layers):\n",
" contents = []\n",
" styles = []\n",
" for i in range(len(net)):\n",
" X = net[i](X)\n",
" if i in style_layers:\n",
" styles.append(X)\n",
" if i in content_layers:\n",
" contents.append(X)\n",
" return contents, styles"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_contents(image_shape, device):\n",
" content_X = preprocess(content_img, image_shape).to(device)\n",
" contents_Y, _ = extract_features(content_X, content_layers, style_layers)\n",
" return content_X, contents_Y\n",
"\n",
"def get_styles(image_shape, device):\n",
" style_X = preprocess(style_img, image_shape).to(device)\n",
" _, styles_Y = extract_features(style_X, content_layers, style_layers)\n",
" return style_X, styles_Y"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 9.11.5 定义损失函数\n",
"### 9.11.5.1 内容损失"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def content_loss(Y_hat, Y):\n",
" return F.mse_loss(Y_hat, Y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 9.11.5.2 样式损失"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def gram(X):\n",
" num_channels, n = X.shape[1], X.shape[2] * X.shape[3]\n",
" X = X.view(num_channels, n)\n",
" return torch.matmul(X, X.t()) / (num_channels * n)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def style_loss(Y_hat, gram_Y):\n",
" return F.mse_loss(gram(Y_hat), gram_Y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 9.11.5.3 总变差损失"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def tv_loss(Y_hat):\n",
" return 0.5 * (F.l1_loss(Y_hat[:, :, 1:, :], Y_hat[:, :, :-1, :]) + \n",
" F.l1_loss(Y_hat[:, :, :, 1:], Y_hat[:, :, :, :-1]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 9.11.5.4 损失函数"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"content_weight, style_weight, tv_weight = 1, 1e3, 10\n",
"\n",
"def compute_loss(X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram):\n",
" # 分别计算内容损失、样式损失和总变差损失\n",
" contents_l = [content_loss(Y_hat, Y) * content_weight for Y_hat, Y in zip(\n",
" contents_Y_hat, contents_Y)]\n",
" styles_l = [style_loss(Y_hat, Y) * style_weight for Y_hat, Y in zip(\n",
" styles_Y_hat, styles_Y_gram)]\n",
" tv_l = tv_loss(X) * tv_weight\n",
" # 对所有损失求和\n",
" l = sum(styles_l) + sum(contents_l) + tv_l\n",
" return contents_l, styles_l, tv_l, l"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 9.11.6 创建和初始化合成图像"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class GeneratedImage(torch.nn.Module):\n",
" def __init__(self, img_shape):\n",
" super(GeneratedImage, self).__init__()\n",
" self.weight = torch.nn.Parameter(torch.rand(*img_shape))\n",
"\n",
" def forward(self):\n",
" return self.weight"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_inits(X, device, lr, styles_Y):\n",
" gen_img = GeneratedImage(X.shape).to(device)\n",
" gen_img.weight.data = X.data\n",
" optimizer = torch.optim.Adam(gen_img.parameters(), lr=lr)\n",
" styles_Y_gram = [gram(Y) for Y in styles_Y]\n",
" return gen_img(), styles_Y_gram, optimizer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 9.11.7 训练"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def train(X, contents_Y, styles_Y, device, lr, max_epochs, lr_decay_epoch):\n",
" print(\"training on \", device)\n",
" X, styles_Y_gram, optimizer = get_inits(X, device, lr, styles_Y)\n",
" scheduler = torch.optim.lr_scheduler.StepLR(optimizer, lr_decay_epoch, gamma=0.1)\n",
" for i in range(max_epochs):\n",
" start = time.time()\n",
" \n",
" contents_Y_hat, styles_Y_hat = extract_features(\n",
" X, content_layers, style_layers)\n",
" contents_l, styles_l, tv_l, l = compute_loss(\n",
" X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram)\n",
" \n",
" optimizer.zero_grad()\n",
" l.backward(retain_graph = True)\n",
" optimizer.step()\n",
" scheduler.step()\n",
" \n",
" if i % 50 == 0 and i != 0:\n",
" print('epoch %3d, content loss %.2f, style loss %.2f, '\n",
" 'TV loss %.2f, %.2f sec'\n",
" % (i, sum(contents_l).item(), sum(styles_l).item(), tv_l.item(),\n",
" time.time() - start))\n",
" return X.detach()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"training on cuda\n",
"epoch 50, content loss 0.24, style loss 1.11, TV loss 1.33, 0.07 sec\n",
"epoch 100, content loss 0.24, style loss 0.81, TV loss 1.20, 0.07 sec\n",
"epoch 150, content loss 0.24, style loss 0.72, TV loss 1.12, 0.07 sec\n",
"epoch 200, content loss 0.24, style loss 0.68, TV loss 1.06, 0.07 sec\n",
"epoch 250, content loss 0.23, style loss 0.68, TV loss 1.05, 0.07 sec\n",
"epoch 300, content loss 0.23, style loss 0.67, TV loss 1.04, 0.07 sec\n",
"epoch 350, content loss 0.23, style loss 0.67, TV loss 1.04, 0.07 sec\n",
"epoch 400, content loss 0.23, style loss 0.67, TV loss 1.03, 0.07 sec\n",
"epoch 450, content loss 0.23, style loss 0.67, TV loss 1.03, 0.07 sec\n"
]
}
],
"source": [
"image_shape = (150, 225)\n",
"net = net.to(device)\n",
"content_X, contents_Y = get_contents(image_shape, device)\n",
"style_X, styles_Y = get_styles(image_shape, device)\n",
"output = train(content_X, contents_Y, styles_Y, device, 0.01, 500, 200)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"d2l.plt.imshow(postprocess(output));"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"training on cuda\n",
"epoch 50, content loss 0.34, style loss 0.63, TV loss 0.79, 0.18 sec\n",
"epoch 100, content loss 0.30, style loss 0.50, TV loss 0.74, 0.18 sec\n",
"epoch 150, content loss 0.29, style loss 0.46, TV loss 0.72, 0.18 sec\n",
"epoch 200, content loss 0.28, style loss 0.43, TV loss 0.70, 0.18 sec\n",
"epoch 250, content loss 0.28, style loss 0.43, TV loss 0.69, 0.18 sec\n",
"epoch 300, content loss 0.27, style loss 0.42, TV loss 0.69, 0.18 sec\n",
"epoch 350, content loss 0.27, style loss 0.42, TV loss 0.69, 0.18 sec\n",
"epoch 400, content loss 0.27, style loss 0.42, TV loss 0.69, 0.18 sec\n",
"epoch 450, content loss 0.27, style loss 0.42, TV loss 0.69, 0.18 sec\n"
]
}
],
"source": [
"image_shape = (300, 450)\n",
"_, content_Y = get_contents(image_shape, device)\n",
"_, style_Y = get_styles(image_shape, device)\n",
"X = preprocess(postprocess(output), image_shape).to(device)\n",
"big_output = train(X, content_Y, style_Y, device, 0.01, 500, 200)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"d2l.set_figsize((7, 5))\n",
"d2l.plt.imshow(postprocess(big_output));"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}