vgg_preprocessing.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. """Provides utilities to preprocess images.
  16. The preprocessing steps for VGG were introduced in the following technical
  17. report:
  18. Very Deep Convolutional Networks For Large-Scale Image Recognition
  19. Karen Simonyan and Andrew Zisserman
  20. arXiv technical report, 2015
  21. PDF: http://arxiv.org/pdf/1409.1556.pdf
  22. ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
  23. CC-BY-4.0
  24. More information can be obtained from the VGG website:
  25. www.robots.ox.ac.uk/~vgg/research/very_deep/
  26. """
  27. from __future__ import absolute_import
  28. from __future__ import division
  29. from __future__ import print_function
  30. import tensorflow as tf
  31. from tensorflow.python.ops import control_flow_ops
  32. slim = tf.contrib.slim
  # Per-channel values subtracted from channels 0, 1 and 2 (in that order) by
  # the train and eval preprocessing functions.
  _R_MEAN, _G_MEAN, _B_MEAN = 123.68, 116.78, 103.94

  # Default lower and upper bounds for the smallest image side produced by the
  # aspect-preserving resize.
  _RESIZE_SIDE_MIN, _RESIZE_SIDE_MAX = 256, 512
  def _crop(image, offset_height, offset_width, crop_height, crop_width):
      """Cuts a `crop_height` x `crop_width` window out of `image`.

      The static size of `image` does not need to be known, but its rank is
      assumed (and checked when the graph runs) to be 3.

      Args:
        image: an image tensor of shape [height, width, channels].
        offset_height: a scalar tensor giving the first row of the window.
        offset_width: a scalar tensor giving the first column of the window.
        crop_height: the height of the returned window.
        crop_width: the width of the returned window.

      Returns:
        The cropped image, reshaped to [crop_height, crop_width, channels].

      Raises:
        InvalidArgumentError: if the rank of `image` is not 3 or if the image is
          smaller than the requested crop.
      """
      input_shape = tf.shape(image)

      is_rank_three = tf.equal(tf.rank(image), 3)
      check_rank = tf.Assert(is_rank_three, ['Rank of image must be equal to 3.'])
      # The target shape is produced behind the rank check, so everything that
      # consumes the shape also depends on that check.
      target_shape = control_flow_ops.with_dependencies(
          [check_rank],
          tf.stack([crop_height, crop_width, input_shape[2]]))

      tall_enough = tf.greater_equal(input_shape[0], crop_height)
      wide_enough = tf.greater_equal(input_shape[1], crop_width)
      check_size = tf.Assert(
          tf.logical_and(tall_enough, wide_enough),
          ['Crop size greater than the image size.'])

      # Offsets are converted to int32 before being used as slice indices.
      begin = tf.to_int32(tf.stack([offset_height, offset_width, 0]))

      # tf.slice is used rather than crop_to_bounding_box because it accepts
      # tensors for the crop size.
      window = control_flow_ops.with_dependencies(
          [check_size],
          tf.slice(image, begin, target_shape))
      return tf.reshape(window, target_shape)
  def _random_crop(image_list, crop_height, crop_width):
      """Applies one randomly positioned crop to every image in `image_list`.

      The same offsets are used for all images, which is useful when several
      inputs of identical height and width must stay aligned, e.g.:

        image, depths, normals = _random_crop([image, depths, normals], 120, 150)

      Args:
        image_list: a non-empty list of rank-3 image tensors sharing the same
          height and width but possibly differing in channel count.
        crop_height: the height of each cropped image.
        crop_width: the width of each cropped image.

      Returns:
        A list with the cropped version of every image in `image_list`.

      Raises:
        ValueError: if `image_list` is empty.
        InvalidArgumentError: (when the graph runs) if an image is not of rank 3,
          if the images differ in height or width, or if the first image is
          smaller than the crop size.
      """
      if not image_list:
          raise ValueError('Empty image_list.')

      # One rank assertion per image.
      rank_assertions = []
      for image in image_list:
          image_rank = tf.rank(image)
          rank_assertions.append(tf.Assert(
              tf.equal(image_rank, 3),
              ['Wrong rank for tensor %s [expected] [actual]',
               image.name, 3, image_rank]))

      # The first image provides the reference height and width.
      image_shape = control_flow_ops.with_dependencies(
          [rank_assertions[0]],
          tf.shape(image_list[0]))
      image_height = image_shape[0]
      image_width = image_shape[1]
      crop_size_assert = tf.Assert(
          tf.logical_and(
              tf.greater_equal(image_height, crop_height),
              tf.greater_equal(image_width, crop_width)),
          ['Crop size greater than the image size.'])

      asserts = [rank_assertions[0], crop_size_assert]

      # Every remaining image must match the reference height and width.
      for image, rank_assertion in zip(image_list[1:], rank_assertions[1:]):
          asserts.append(rank_assertion)
          shape = control_flow_ops.with_dependencies([rank_assertion],
                                                     tf.shape(image))
          height = shape[0]
          width = shape[1]

          # The data is listed as (expected, actual) to match the message labels:
          # the reference size first, then this image's own size.
          height_assert = tf.Assert(
              tf.equal(height, image_height),
              ['Wrong height for tensor %s [expected][actual]',
               image.name, image_height, height])
          width_assert = tf.Assert(
              tf.equal(width, image_width),
              ['Wrong width for tensor %s [expected][actual]',
               image.name, image_width, width])
          asserts.extend([height_assert, width_assert])

      # Create a random bounding box.
      #
      # tf.random_uniform is used instead of numpy.random.rand so that the
      # offsets are drawn at graph evaluation time rather than once at graph
      # definition time.
      max_offset_height = control_flow_ops.with_dependencies(
          asserts, tf.reshape(image_height - crop_height + 1, []))
      max_offset_width = control_flow_ops.with_dependencies(
          asserts, tf.reshape(image_width - crop_width + 1, []))
      offset_height = tf.random_uniform(
          [], maxval=max_offset_height, dtype=tf.int32)
      offset_width = tf.random_uniform(
          [], maxval=max_offset_width, dtype=tf.int32)

      return [_crop(image, offset_height, offset_width,
                    crop_height, crop_width) for image in image_list]
  def _central_crop(image_list, crop_height, crop_width):
      """Crops the central region out of every image in `image_list`.

      Args:
        image_list: a list of image tensors of the same dimension but possibly
          varying channel.
        crop_height: the height of each image after the crop.
        crop_width: the width of each image after the crop.

      Returns:
        A list with one centrally cropped image per input image.
      """
      def _crop_center(image):
          height = tf.shape(image)[0]
          width = tf.shape(image)[1]
          # `/` is true division here (the module imports `division` from
          # `__future__`), so the offsets may not be integer-typed; `_crop`
          # converts them with `tf.to_int32`.
          top = (height - crop_height) / 2
          left = (width - crop_width) / 2
          return _crop(image, top, left, crop_height, crop_width)

      return [_crop_center(image) for image in image_list]
  def _mean_image_subtraction(image, means):
      """Centers `image` by subtracting one mean value per channel.

      For example:
        means = [123.68, 116.779, 103.939]
        image = _mean_image_subtraction(image, means)

      The static rank of `image` must be known.

      Args:
        image: a tensor of size [height, width, C].
        means: a sequence of C values, one to subtract from each channel.

      Returns:
        The centered image.

      Raises:
        ValueError: if the rank of `image` is unknown or different from three,
          or if the number of channels in `image` does not equal `len(means)`.
      """
      static_shape = image.get_shape()
      if static_shape.ndims != 3:
          raise ValueError('Input must be of size [height, width, C>0]')

      num_channels = static_shape.as_list()[-1]
      if len(means) != num_channels:
          raise ValueError('len(means) must match the number of channels')

      # Split along the channel axis, shift each channel, then reassemble.
      per_channel = tf.split(axis=2, num_or_size_splits=num_channels, value=image)
      centered = [channel - mean for channel, mean in zip(per_channel, means)]
      return tf.concat(axis=2, values=centered)
  def _smallest_size_at_least(height, width, smallest_side):
      """Computes a new shape whose smallest side equals `smallest_side`.

      The aspect ratio of the original (height, width) pair is preserved.

      Args:
        height: an int32 scalar tensor indicating the current height.
        width: an int32 scalar tensor indicating the current width.
        smallest_side: A python integer or scalar `Tensor` indicating the size of
          the smallest side after resize.

      Returns:
        new_height: an int32 scalar tensor indicating the new height.
        new_width: an int32 scalar tensor indicating the new width.
      """
      smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)

      height = tf.to_float(height)
      width = tf.to_float(width)
      smallest_side = tf.to_float(smallest_side)

      # Scale by whichever side is currently the smaller one.
      scale = tf.cond(tf.greater(height, width),
                      lambda: smallest_side / width,
                      lambda: smallest_side / height)

      # Round to the nearest integer before casting. A plain int32 cast
      # truncates, so float error (e.g. 255.99997) could make the smallest side
      # come out one pixel short of `smallest_side`.
      new_height = tf.to_int32(tf.rint(height * scale))
      new_width = tf.to_int32(tf.rint(width * scale))
      return new_height, new_width
  def _aspect_preserving_resize(image, smallest_side):
      """Resizes an image while preserving its original aspect ratio.

      Args:
        image: A 3-D image `Tensor`.
        smallest_side: A python integer or scalar `Tensor` indicating the size of
          the smallest side after resize.

      Returns:
        resized_image: A 3-D tensor containing the resized image. Its static
          shape is set to [None, None, 3].
      """
      smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)

      shape = tf.shape(image)
      height = shape[0]
      width = shape[1]
      new_height, new_width = _smallest_size_at_least(height, width, smallest_side)

      # Add a leading dimension of size one for the resize call.
      image = tf.expand_dims(image, 0)
      resized_image = tf.image.resize_bilinear(image, [new_height, new_width],
                                               align_corners=False)
      # Remove only the leading dimension added above. An unqualified squeeze
      # would also drop any other dimension that happens to have size 1.
      resized_image = tf.squeeze(resized_image, axis=[0])
      resized_image.set_shape([None, None, 3])
      return resized_image
  def preprocess_for_train(image,
                           output_height,
                           output_width,
                           resize_side_min=_RESIZE_SIDE_MIN,
                           resize_side_max=_RESIZE_SIDE_MAX):
      """Preprocesses the given image for training.

      The image is resized so that its smallest side equals a value sampled from
      [`resize_side_min`, `resize_side_max`], randomly cropped to the output
      size, randomly flipped left/right and finally mean-centered.

      Args:
        image: A `Tensor` representing an image of arbitrary size.
        output_height: The height of the image after preprocessing.
        output_width: The width of the image after preprocessing.
        resize_side_min: The lower bound for the smallest side of the image for
          aspect-preserving resizing.
        resize_side_max: The upper bound for the smallest side of the image for
          aspect-preserving resizing.

      Returns:
        A preprocessed image.
      """
      # `maxval` is passed as resize_side_max + 1 so that the upper bound itself
      # can be sampled.
      sampled_side = tf.random_uniform(
          [], minval=resize_side_min, maxval=resize_side_max + 1, dtype=tf.int32)

      resized = _aspect_preserving_resize(image, sampled_side)
      cropped = _random_crop([resized], output_height, output_width)[0]
      cropped.set_shape([output_height, output_width, 3])

      flipped = tf.image.random_flip_left_right(tf.to_float(cropped))
      return _mean_image_subtraction(flipped, [_R_MEAN, _G_MEAN, _B_MEAN])
  def preprocess_for_eval(image, output_height, output_width, resize_side):
      """Preprocesses the given image for evaluation.

      The image is resized so that its smallest side equals `resize_side`,
      centrally cropped to the output size and then mean-centered.

      Args:
        image: A `Tensor` representing an image of arbitrary size.
        output_height: The height of the image after preprocessing.
        output_width: The width of the image after preprocessing.
        resize_side: The smallest side of the image for aspect-preserving resizing.

      Returns:
        A preprocessed image.
      """
      resized = _aspect_preserving_resize(image, resize_side)
      cropped = _central_crop([resized], output_height, output_width)[0]
      cropped.set_shape([output_height, output_width, 3])
      return _mean_image_subtraction(
          tf.to_float(cropped), [_R_MEAN, _G_MEAN, _B_MEAN])
  def preprocess_image(image, output_height, output_width, is_training=False,
                       resize_side_min=_RESIZE_SIDE_MIN,
                       resize_side_max=_RESIZE_SIDE_MAX):
      """Preprocesses the given image for either training or evaluation.

      Args:
        image: A `Tensor` representing an image of arbitrary size.
        output_height: The height of the image after preprocessing.
        output_width: The width of the image after preprocessing.
        is_training: `True` if we're preprocessing the image for training and
          `False` otherwise.
        resize_side_min: The lower bound for the smallest side of the image for
          aspect-preserving resizing. If `is_training` is `False`, then this value
          is used for rescaling.
        resize_side_max: The upper bound for the smallest side of the image for
          aspect-preserving resizing. If `is_training` is `False`, this value is
          ignored. Otherwise, the resize side is sampled from
          [resize_side_min, resize_side_max].

      Returns:
        A preprocessed image.
      """
      if not is_training:
          # Evaluation uses a single fixed resize side: the lower bound.
          return preprocess_for_eval(image, output_height, output_width,
                                     resize_side_min)
      return preprocess_for_train(image, output_height, output_width,
                                  resize_side_min, resize_side_max)