model.py

# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Cross Convolutional Model.

https://arxiv.org/pdf/1607.02586v1.pdf
"""
import math
import sys

import tensorflow as tf

slim = tf.contrib.slim


class CrossConvModel(object):

  def __init__(self, image_diff_list, params):
    """Constructor.

    Args:
      image_diff_list: A list of (image, diff) tuples, with shape
          [batch_size, image_size, image_size, 3] and image_sizes as
          [32, 64, 128, 256].
      params: Dict of parameters. Keys read in this class: 'scale',
          'is_training', 'learning_rate', 'batch_size', and the loss
          switches 'l2_loss', 'reconstr_loss' and 'kl_loss'.
    """
    self.images = [i for (i, _) in image_diff_list]
    # Move the diff to the positive realm.
    self.diffs = [(d + params['scale']) / 2 for (i, d) in image_diff_list]
    self.params = params

  def Build(self):
    with tf.device('/gpu:0'):
      with slim.arg_scope([slim.conv2d],
                          activation_fn=tf.nn.relu,
                          normalizer_fn=slim.batch_norm,
                          normalizer_params={'is_training':
                                             self.params['is_training']}):
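        # Pipeline from the paper: encode the motion into a convolution
        # kernel, encode the image pyramid, cross-convolve the two, then
        # decode the result into a predicted difference image.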
        self._BuildMotionKernel()
        encoded_images = self._BuildImageEncoder()
        cross_conved_images = self._CrossConv(encoded_images)
        self._BuildImageDecoder(cross_conved_images)
        self._BuildLoss()

      image = self.images[1]
      diff = self.diffs[1]

      self.global_step = tf.Variable(0, name='global_step', trainable=False)

      if self.params['is_training']:
        self._BuildTrainOp()

      diff = diff * 2.0 - self.params['scale']
      diff_output = self.diff_output * 2.0 - self.params['scale']
      concat_image = tf.concat(
          axis=1, values=[image, image + diff_output, image + diff, diff_output])
      tf.summary.image('origin_predict_expect_predictdiff', concat_image)

      self.summary_op = tf.summary.merge_all()
      return self.loss

  def _BuildTrainOp(self):
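    # Stepwise-decayed SGD: the rate starts at params['learning_rate'] and
    # halves every 10k steps (smoothly, since staircase defaults to False),
    # floored at 0.01.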
    lrn_rate = tf.maximum(
        0.01,  # min_lr_rate.
        tf.train.exponential_decay(
            self.params['learning_rate'], self.global_step, 10000, 0.5))
    tf.summary.scalar('learning rate', lrn_rate)
    optimizer = tf.train.GradientDescentOptimizer(lrn_rate)
    self.train_op = slim.learning.create_train_op(
        self.loss, optimizer, global_step=self.global_step)

  def _BuildLoss(self):
    # 1. reconstr_loss doesn't seem to do better than l2 loss.
    # 2. Only works with reduce_mean; reduce_sum doesn't work.
    # 3. The kl loss doesn't seem to play an important role.
    self.loss = 0
    with tf.variable_scope('loss'):
      if self.params['l2_loss']:
        l2_loss = tf.reduce_mean(tf.square(self.diff_output - self.diffs[1]))
        tf.summary.scalar('l2_loss', l2_loss)
        self.loss += l2_loss
      if self.params['reconstr_loss']:
        # Bernoulli cross-entropy between the target diff and the prediction.
        reconstr_loss = (-tf.reduce_mean(
            self.diffs[1] * tf.log(1e-10 + self.diff_output) +
            (1 - self.diffs[1]) * tf.log(1e-10 + 1 - self.diff_output)))
        reconstr_loss = tf.check_numerics(reconstr_loss, 'reconstr_loss')
        tf.summary.scalar('reconstr_loss', reconstr_loss)
        self.loss += reconstr_loss
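      # Closed-form KL(N(mu, sigma) || N(0, 1)), i.e. 0.5 * (mu^2 + sigma^2
      # - 2*log(sigma) - 1) per dimension, averaged rather than summed here;
      # z_stddev_log is log(sigma).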
      if self.params['kl_loss']:
        kl_loss = (0.5 * tf.reduce_mean(
            tf.square(self.z_mean) + tf.square(self.z_stddev) -
            2 * self.z_stddev_log - 1))
        tf.summary.scalar('kl_loss', kl_loss)
        self.loss += kl_loss

      tf.summary.scalar('loss', self.loss)

  def _BuildMotionKernel(self):
    image = self.images[-2]
    diff = self.diffs[-2]
    shape = image.get_shape().as_list()
    assert shape[1] == shape[2] and shape[1] == 128
    batch_size = shape[0]

    net = tf.concat(axis=3, values=[image, diff])
    with tf.variable_scope('motion_encoder'):
      with slim.arg_scope([slim.conv2d], padding='VALID'):
        net = slim.conv2d(net, 96, [5, 5], stride=1)
        net = slim.max_pool2d(net, [2, 2])
        net = slim.conv2d(net, 96, [5, 5], stride=1)
        net = slim.max_pool2d(net, [2, 2])
        net = slim.conv2d(net, 128, [5, 5], stride=1)
        net = slim.conv2d(net, 128, [5, 5], stride=1)
        net = slim.max_pool2d(net, [2, 2])
        net = slim.conv2d(net, 256, [4, 4], stride=1)
        net = slim.conv2d(net, 256, [3, 3], stride=1)
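        # With the 128x128 inputs asserted above and these VALID
        # convolutions/poolings, net comes out as [batch_size, 5, 5, 256],
        # i.e. 6400 values per example.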
        z = tf.reshape(net, shape=[batch_size, -1])
        self.z_mean, self.z_stddev_log = tf.split(
            axis=1, num_or_size_splits=2, value=z)
        self.z_stddev = tf.exp(self.z_stddev_log)
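        # Reparameterization trick: sample the kernel from
        # N(z_mean, z_stddev) while keeping the sample differentiable
        # w.r.t. the encoder outputs.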
        epsilon = tf.random_normal(
            self.z_mean.get_shape().as_list(), 0, 1, dtype=tf.float32)
        kernel = self.z_mean + tf.multiply(self.z_stddev, epsilon)

        width = int(math.sqrt(kernel.get_shape().as_list()[1] // 128))
        kernel = tf.reshape(kernel, [batch_size, width, width, 128])
    with tf.variable_scope('kernel_decoder'):
      with slim.arg_scope([slim.conv2d], padding='SAME'):
        kernel = slim.conv2d(kernel, 128, [5, 5], stride=1)
        self.kernel = slim.conv2d(kernel, 128, [5, 5], stride=1)
    sys.stderr.write('kernel shape: %s\n' % kernel.get_shape())

  def _BuildImageEncoder(self):
    feature_maps = []
    for (i, image) in enumerate(self.images):
      with tf.variable_scope('image_encoder_%d' % i):
        with slim.arg_scope([slim.conv2d, slim.max_pool2d], padding='SAME'):
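          # Two stride-2 poolings shrink each scale by 4x, leaving a
          # 32-channel (size/4 x size/4) feature map per image.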
          net = slim.conv2d(image, 64, [5, 5], stride=1)
          net = slim.conv2d(net, 64, [5, 5], stride=1)
          net = slim.max_pool2d(net, [5, 5])
          net = slim.conv2d(net, 64, [5, 5], stride=1)
          net = slim.conv2d(net, 32, [5, 5], stride=1)
          net = slim.max_pool2d(net, [2, 2])
          sys.stderr.write('image_conv shape: %s\n' % net.get_shape())
          feature_maps.append(net)
    return feature_maps

  def _CrossConvHelper(self, encoded_image, kernel):
    """Cross Convolution.

    The encoded image and kernel have the same number of channels. The
    image is split into per-channel [image_size, image_size] squares and
    the kernel into per-channel [kernel_size, kernel_size] squares; each
    kernel square is convolved with its image square.
    """
    images = tf.expand_dims(encoded_image, 0)
    kernels = tf.expand_dims(kernel, 3)
    return tf.nn.depthwise_conv2d(images, kernels, [1, 1, 1, 1], 'SAME')

  def _CrossConv(self, encoded_images):
    """Apply the motion kernel on the encoded_images."""
    cross_conved_images = []
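    # The 128-channel kernel splits into four 32-channel kernels, one per
    # image scale; 32 matches the channel count of each encoded image.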
    kernels = tf.split(axis=3, num_or_size_splits=4, value=self.kernel)
    for (i, encoded_image) in enumerate(encoded_images):
      with tf.variable_scope('cross_conv_%d' % i):
        kernel = kernels[i]

        encoded_image = tf.unstack(encoded_image, axis=0)
        kernel = tf.unstack(kernel, axis=0)
        assert len(encoded_image) == len(kernel)
        assert len(encoded_image) == self.params['batch_size']

        conved_image = []
        for j in range(len(encoded_image)):
          conved_image.append(self._CrossConvHelper(
              encoded_image[j], kernel[j]))
        cross_conved_images.append(tf.concat(axis=0, values=conved_image))
        sys.stderr.write('cross_conved shape: %s\n' %
                         cross_conved_images[-1].get_shape())
    return cross_conved_images

  def _Deconv(self, net, out_filters, kernel_size, stride):
    shape = net.get_shape().as_list()
    in_filters = shape[3]
    kernel_shape = [kernel_size, kernel_size, out_filters, in_filters]

    weights = tf.get_variable(
        name='weights',
        shape=kernel_shape,
        dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.01))

    out_height = shape[1] * stride
    out_width = shape[2] * stride
    batch_size = shape[0]

    output_shape = [batch_size, out_height, out_width, out_filters]
    net = tf.nn.conv2d_transpose(net, weights, output_shape,
                                 [1, stride, stride, 1], padding='SAME')
    # Keep the normalized tensor; slim.batch_norm returns a new op rather
    # than modifying its input in place.
    net = slim.batch_norm(net)
    return net

  def _BuildImageDecoder(self, cross_conved_images):
    """Decode the cross_conved feature maps into the predicted images."""
    nets = []
    for i, cross_conved_image in enumerate(cross_conved_images):
      with tf.variable_scope('image_decoder_%d' % i):
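        # Encoded maps are 8/16/32/64 pixels wide, so strides 8/4/2/1
        # upsample every scale to a common 64x64 resolution.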
        # Integer division keeps the stride an int for conv2d_transpose.
        stride = 64 // cross_conved_image.get_shape().as_list()[1]
        # TODO(xpan): Alternative solution for upsampling?
        nets.append(self._Deconv(
            cross_conved_image, 64, kernel_size=3, stride=stride))

    net = tf.concat(axis=3, values=nets)
    net = slim.conv2d(net, 128, [9, 9], padding='SAME', stride=1)
    net = slim.conv2d(net, 128, [1, 1], padding='SAME', stride=1)
    net = slim.conv2d(net, 3, [1, 1], padding='SAME', stride=1)
    self.diff_output = net
    sys.stderr.write('diff_output shape: %s\n' % self.diff_output.get_shape())
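

# A minimal usage sketch (illustrative only; every value below is an
# assumption, not part of the original module). It builds the graph on
# placeholder inputs at the four scales the constructor documents.
if __name__ == '__main__':
  batch_size = 8  # hypothetical
  params = {
      'batch_size': batch_size,
      'scale': 1.0,              # assumed diff scale
      'is_training': True,
      'learning_rate': 0.8,      # hypothetical initial rate
      'l2_loss': True,
      'reconstr_loss': False,
      'kl_loss': True,
  }
  image_diff_list = []
  for size in [32, 64, 128, 256]:
    image = tf.placeholder(tf.float32, [batch_size, size, size, 3])
    diff = tf.placeholder(tf.float32, [batch_size, size, size, 3])
    image_diff_list.append((image, diff))
  model = CrossConvModel(image_diff_list, params)
  loss = model.Build()  # Returns the scalar training loss tensor.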