- # Copyright 2016 Google Inc. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ==============================================================================
- """Read and preprocess image data.
- Image processing occurs on a single image at a time. Images are read and
- preprocessed in parallel across multiple threads. The resulting images
- are concatenated together to form a single batch for training or evaluation.
- -- Provide processed image data for a network:
- inputs: Construct batches of evaluation examples of images.
- distorted_inputs: Construct batches of training examples of images.
- batch_inputs: Construct batches of training or evaluation examples of images.
- -- Data processing:
- parse_example_proto: Parses an Example proto containing a training example
- of an image.
- -- Image decoding:
- decode_jpeg: Decode a JPEG encoded string into a 3-D float32 Tensor.
- -- Image preprocessing:
- image_preprocessing: Decode and preprocess one image for evaluation or training
- distort_image: Distort one image for training a network.
- eval_image: Prepare one image for evaluation.
- distort_color: Distort the color in one image for training.
- """
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
- import tensorflow as tf
- FLAGS = tf.app.flags.FLAGS
- tf.app.flags.DEFINE_integer('batch_size', 32,
- """Number of images to process in a batch.""")
- tf.app.flags.DEFINE_integer('image_size', 299,
- """Provide square images of this size.""")
- tf.app.flags.DEFINE_integer('num_preprocess_threads', 4,
- """Number of preprocessing threads per tower. """
- """Please make this a multiple of 4.""")
- # Images are preprocessed asynchronously using multiple threads specified by
- # --num_preprocess_threads and the resulting processed images are stored in a
- # random shuffling queue. The shuffling queue dequeues --batch_size images
- # for processing on a given Inception tower. A larger shuffling queue guarantees
- # better mixing across examples within a batch and results in slightly higher
- # predictive performance in a trained model. Empirically,
- # --input_queue_memory_factor=16 works well. A value of 16 implies a queue size
- # of 1024*16 images. Assuming RGB 299x299 float32 images (about 1MB each),
- # this implies a queue size of roughly 17.6GB. If the machine is memory
- # limited, decrease this factor to reduce the CPU memory footprint.
- tf.app.flags.DEFINE_integer('input_queue_memory_factor', 16,
- """Size of the queue of preprocessed images. """
- """Default is ideal but try smaller values, e.g. """
- """4, 2 or 1, if host memory is constrained. See """
- """comments in code for more details.""")
- def inputs(dataset, batch_size=None, num_preprocess_threads=None):
- """Generate batches of ImageNet images for evaluation.
- Use this function as the inputs for evaluating a network.
- Note that some (minimal) image preprocessing occurs during evaluation
- including central cropping and resizing of the image to fit the network.
- Args:
- dataset: instance of Dataset class specifying the dataset.
- batch_size: integer, number of examples in batch
- num_preprocess_threads: integer, total number of preprocessing threads;
- None defaults to FLAGS.num_preprocess_threads.
- Returns:
- images: Images. 4D tensor of size [batch_size, FLAGS.image_size,
- FLAGS.image_size, 3].
- labels: 1-D integer Tensor of [batch_size].
- """
- if not batch_size:
- batch_size = FLAGS.batch_size
- # Force all input processing onto CPU in order to reserve the GPU for
- # the forward inference and back-propagation.
- with tf.device('/cpu:0'):
- images, labels = batch_inputs(
- dataset, batch_size, train=False,
- num_preprocess_threads=num_preprocess_threads)
- return images, labels
- def distorted_inputs(dataset, batch_size=None, num_preprocess_threads=None):
- """Generate batches of distorted versions of ImageNet images.
- Use this function as the inputs for training a network.
- Distorting images provides a useful technique for augmenting the data
- set during training in order to make the network invariant to aspects
- of the image that do not affect the label.
- Args:
- dataset: instance of Dataset class specifying the dataset.
- batch_size: integer, number of examples in batch
- num_preprocess_threads: integer, total number of preprocessing threads;
- None defaults to FLAGS.num_preprocess_threads.
- Returns:
- images: Images. 4D tensor of size [batch_size, FLAGS.image_size,
- FLAGS.image_size, 3].
- labels: 1-D integer Tensor of [batch_size].
- """
- if not batch_size:
- batch_size = FLAGS.batch_size
- # Force all input processing onto CPU in order to reserve the GPU for
- # the forward inference and back-propagation.
- with tf.device('/cpu:0'):
- images, labels = batch_inputs(
- dataset, batch_size, train=True,
- num_preprocess_threads=num_preprocess_threads)
- return images, labels
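- # Usage sketch for the two entry points above (illustrative only; `dataset`
- # is assumed to be a concrete instance of the Dataset class described in
- # dataset.py, and the snippet must run inside a session with queue runners
- # started):
- #
- #   images, labels = distorted_inputs(dataset, batch_size=32)   # training
- #   # images, labels = inputs(dataset, batch_size=32)           # evaluation
- #   with tf.Session() as sess:
- #     coord = tf.train.Coordinator()
- #     threads = tf.train.start_queue_runners(sess=sess, coord=coord)
- #     image_batch, label_batch = sess.run([images, labels])
- #     coord.request_stop()
- #     coord.join(threads)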
- def decode_jpeg(image_buffer, scope=None):
- """Decode a JPEG string into one 3-D float image Tensor.
- Args:
- image_buffer: scalar string Tensor.
- scope: Optional scope for op_scope.
- Returns:
- 3-D float Tensor with values ranging from [0, 1).
- """
- with tf.op_scope([image_buffer], scope, 'decode_jpeg'):
- # Decode the string as an RGB JPEG.
- # Note that the resulting image contains an unknown height and width
- # that is set dynamically by decode_jpeg. In other words, the height
- # and width of image is unknown at compile-time.
- image = tf.image.decode_jpeg(image_buffer, channels=3)
- # After this point, all image pixels reside in [0,1)
- # until the very end, when they're rescaled to (-1, 1). The various
- # adjust_* ops all require this range for dtype float.
- image = tf.image.convert_image_dtype(image, dtype=tf.float32)
- return image
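- # Example (a sketch; the file path below is hypothetical):
- #
- #   image_buffer = tf.read_file('/tmp/example.jpg')
- #   image = decode_jpeg(image_buffer)  # 3-D float32 Tensor with values in [0, 1)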
- def distort_color(image, thread_id=0, scope=None):
- """Distort the color of the image.
- Each color distortion is non-commutative and thus ordering of the color ops
- matters. Ideally we would randomly permute the ordering of the color ops.
- Rather than adding that level of complication, we select a distinct ordering
- of color ops for each preprocessing thread.
- Args:
- image: Tensor containing single image.
- thread_id: preprocessing thread ID.
- scope: Optional scope for op_scope.
- Returns:
- color-distorted image
- """
- with tf.op_scope([image], scope, 'distort_color'):
- color_ordering = thread_id % 2
- if color_ordering == 0:
- image = tf.image.random_brightness(image, max_delta=32. / 255.)
- image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
- image = tf.image.random_hue(image, max_delta=0.2)
- image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
- elif color_ordering == 1:
- image = tf.image.random_brightness(image, max_delta=32. / 255.)
- image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
- image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
- image = tf.image.random_hue(image, max_delta=0.2)
- # The random_* ops do not necessarily clamp.
- image = tf.clip_by_value(image, 0.0, 1.0)
- return image
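- # Sketch of how the thread id selects the ordering of color ops above
- # (illustrative only):
- #
- #   distorted_a = distort_color(image, thread_id=0)  # brightness, saturation, hue, contrast
- #   distorted_b = distort_color(image, thread_id=1)  # brightness, contrast, saturation, hue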
- def distort_image(image, height, width, bbox, thread_id=0, scope=None):
- """Distort one image for training a network.
- Distorting images provides a useful technique for augmenting the data
- set during training in order to make the network invariant to aspects
- of the image that do not affect the label.
- Args:
- image: 3-D float Tensor of image
- height: integer
- width: integer
- bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
- where each coordinate is [0, 1) and the coordinates are arranged
- as [ymin, xmin, ymax, xmax].
- thread_id: integer indicating the preprocessing thread.
- scope: Optional scope for op_scope.
- Returns:
- 3-D float Tensor of distorted image used for training.
- """
- with tf.op_scope([image, height, width, bbox], scope, 'distort_image'):
- # Each bounding box has shape [1, num_boxes, box coords] and
- # the coordinates are ordered [ymin, xmin, ymax, xmax].
- # Display the bounding box in the first thread only.
- if not thread_id:
- image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
- bbox)
- tf.image_summary('image_with_bounding_boxes', image_with_box)
- # A large fraction of image datasets contain a human-annotated bounding
- # box delineating the region of the image containing the object of interest.
- # We choose to create a new bounding box for the object which is a randomly
- # distorted version of the human-annotated bounding box that obeys an allowed
- # range of aspect ratios, sizes and overlap with the human-annotated
- # bounding box. If no box is supplied, then we assume the bounding box is
- # the entire image.
- sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
- tf.shape(image),
- bounding_boxes=bbox,
- min_object_covered=0.1,
- aspect_ratio_range=[0.75, 1.33],
- area_range=[0.05, 1.0],
- max_attempts=100,
- use_image_if_no_bounding_boxes=True)
- bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box
- if not thread_id:
- image_with_distorted_box = tf.image.draw_bounding_boxes(
- tf.expand_dims(image, 0), distort_bbox)
- tf.image_summary('images_with_distorted_bounding_box',
- image_with_distorted_box)
- # Crop the image to the specified bounding box.
- distorted_image = tf.slice(image, bbox_begin, bbox_size)
- # This resizing operation may distort the images because the aspect
- # ratio is not respected. We select a resize method in a round robin
- # fashion based on the thread number.
- # Note that ResizeMethod contains 4 enumerated resizing methods.
- resize_method = thread_id % 4
- distorted_image = tf.image.resize_images(distorted_image, height, width,
- resize_method)
- # Restore the shape since the dynamic slice based upon the bbox_size loses
- # the third dimension.
- distorted_image.set_shape([height, width, 3])
- if not thread_id:
- tf.image_summary('cropped_resized_image',
- tf.expand_dims(distorted_image, 0))
- # Randomly flip the image horizontally.
- distorted_image = tf.image.random_flip_left_right(distorted_image)
- # Randomly distort the colors.
- distorted_image = distort_color(distorted_image, thread_id)
- if not thread_id:
- tf.image_summary('final_distorted_image',
- tf.expand_dims(distorted_image, 0))
- return distorted_image
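- # A whole-image bounding box can be passed in when no human annotation is
- # available, as noted above. A minimal sketch (coordinates are ordered
- # [ymin, xmin, ymax, xmax] and span [0, 1]):
- #
- #   whole_image_bbox = tf.constant([0.0, 0.0, 1.0, 1.0],
- #                                  dtype=tf.float32, shape=[1, 1, 4])
- #   distorted = distort_image(image, FLAGS.image_size, FLAGS.image_size,
- #                             whole_image_bbox)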
- def eval_image(image, height, width, scope=None):
- """Prepare one image for evaluation.
- Args:
- image: 3-D float Tensor
- height: integer
- width: integer
- scope: Optional scope for op_scope.
- Returns:
- 3-D float Tensor of prepared image.
- """
- with tf.op_scope([image, height, width], scope, 'eval_image'):
- # Crop the central region of the image, retaining 87.5% of the image along
- # each dimension.
- image = tf.image.central_crop(image, central_fraction=0.875)
- # Resize the image to the requested height and width.
- image = tf.expand_dims(image, 0)
- image = tf.image.resize_bilinear(image, [height, width],
- align_corners=False)
- image = tf.squeeze(image, [0])
- return image
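- # Worked example of the crop-and-resize above (a sketch): a 400x600 input is
- # centrally cropped to 350x525 (87.5% along each dimension) and then resized
- # to the requested size with bilinear interpolation:
- #
- #   processed = eval_image(image, FLAGS.image_size, FLAGS.image_size)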
- def image_preprocessing(image_buffer, bbox, train, thread_id=0):
- """Decode and preprocess one image for evaluation or training.
- Args:
- image_buffer: JPEG encoded string Tensor
- bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
- where each coordinate is [0, 1) and the coordinates are arranged as
- [ymin, xmin, ymax, xmax].
- train: boolean
- thread_id: integer indicating preprocessing thread
- Returns:
- 3-D float Tensor containing an appropriately scaled image
- Raises:
- ValueError: if user does not provide bounding box
- """
- if bbox is None:
- raise ValueError('Please supply a bounding box.')
- image = decode_jpeg(image_buffer)
- height = FLAGS.image_size
- width = FLAGS.image_size
- if train:
- image = distort_image(image, height, width, bbox, thread_id)
- else:
- image = eval_image(image, height, width)
- # Finally, rescale to [-1,1] instead of [0, 1)
- image = tf.sub(image, 0.5)
- image = tf.mul(image, 2.0)
- return image
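- # The final rescaling maps [0, 1) linearly onto [-1, 1): a pixel value of 0.0
- # becomes -1.0, 0.5 becomes 0.0, and values near 1.0 approach 1.0. A minimal
- # call sketch for a single training image:
- #
- #   image = image_preprocessing(image_buffer, bbox, train=True, thread_id=0)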
- def parse_example_proto(example_serialized):
- """Parses an Example proto containing a training example of an image.
- The output of the build_image_data.py image preprocessing script is a dataset
- containing serialized Example protocol buffers. Each Example proto contains
- the following fields:
- image/height: 462
- image/width: 581
- image/colorspace: 'RGB'
- image/channels: 3
- image/class/label: 615
- image/class/synset: 'n03623198'
- image/class/text: 'knee pad'
- image/object/bbox/xmin: 0.1
- image/object/bbox/xmax: 0.9
- image/object/bbox/ymin: 0.2
- image/object/bbox/ymax: 0.6
- image/object/bbox/label: 615
- image/format: 'JPEG'
- image/filename: 'ILSVRC2012_val_00041207.JPEG'
- image/encoded: <JPEG encoded string>
- Args:
- example_serialized: scalar Tensor tf.string containing a serialized
- Example protocol buffer.
- Returns:
- image_buffer: Tensor tf.string containing the contents of a JPEG file.
- label: Tensor tf.int32 containing the label.
- bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
- where each coordinate is [0, 1) and the coordinates are arranged as
- [ymin, xmin, ymax, xmax].
- text: Tensor tf.string containing the human-readable label.
- """
- # Dense features in Example proto.
- feature_map = {
- 'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
- default_value=''),
- 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64,
- default_value=-1),
- 'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
- default_value=''),
- }
- sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
- # Sparse features in Example proto.
- feature_map.update(
- {k: sparse_float32 for k in ['image/object/bbox/xmin',
- 'image/object/bbox/ymin',
- 'image/object/bbox/xmax',
- 'image/object/bbox/ymax']})
- features = tf.parse_single_example(example_serialized, feature_map)
- label = tf.cast(features['image/class/label'], dtype=tf.int32)
- xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
- ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
- xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
- ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
- # Note that we impose an ordering of (y, x) to be consistent with the
- # TensorFlow bounding box convention of [ymin, xmin, ymax, xmax].
- bbox = tf.concat(0, [ymin, xmin, ymax, xmax])
- # Force the variable number of bounding boxes into the shape
- # [1, num_boxes, coords].
- bbox = tf.expand_dims(bbox, 0)
- bbox = tf.transpose(bbox, [0, 2, 1])
- return features['image/encoded'], label, bbox, features['image/class/text']
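- # Sketch of the returned bbox layout (using the illustrative field values
- # above): for a single annotated box with ymin=0.2, xmin=0.1, ymax=0.6 and
- # xmax=0.9, the returned bbox tensor has shape [1, 1, 4] with value
- # [[[0.2, 0.1, 0.6, 0.9]]]; with N boxes the shape is [1, N, 4].
- #
- #   image_buffer, label, bbox, text = parse_example_proto(example_serialized)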
- def batch_inputs(dataset, batch_size, train, num_preprocess_threads=None):
- """Contruct batches of training or evaluation examples from the image dataset.
- Args:
- dataset: instance of Dataset class specifying the dataset.
- See dataset.py for details.
- batch_size: integer
- train: boolean
- num_preprocess_threads: integer, total number of preprocessing threads
- Returns:
- images: 4-D float Tensor of a batch of images
- labels: 1-D integer Tensor of [batch_size].
- Raises:
- ValueError: if data is not found
- """
- with tf.name_scope('batch_processing'):
- data_files = dataset.data_files()
- if data_files is None:
- raise ValueError('No data files found for this dataset')
- filename_queue = tf.train.string_input_producer(data_files, capacity=16)
- if num_preprocess_threads is None:
- num_preprocess_threads = FLAGS.num_preprocess_threads
- if num_preprocess_threads % 4:
- raise ValueError('Please make num_preprocess_threads a multiple '
- 'of 4 (%d %% 4 != 0).' % num_preprocess_threads)
- # Create a subgraph with its own reader (but sharing the
- # filename_queue) for each preprocessing thread.
- images_and_labels = []
- for thread_id in range(num_preprocess_threads):
- reader = dataset.reader()
- _, example_serialized = reader.read(filename_queue)
- # Parse a serialized Example proto to extract the image and metadata.
- image_buffer, label_index, bbox, _ = parse_example_proto(
- example_serialized)
- image = image_preprocessing(image_buffer, bbox, train, thread_id)
- images_and_labels.append([image, label_index])
- # Approximate number of examples per shard.
- examples_per_shard = 1024
- # Size the random shuffle queue to balance between good global
- # mixing (more examples) and memory use (fewer examples).
- # 1 image uses 299*299*3*4 bytes = ~1.07MB
- # The default input_queue_memory_factor is 16, implying a shuffling queue
- # size of: examples_per_shard * 16 * ~1.07MB = ~17.6GB
- min_queue_examples = examples_per_shard * FLAGS.input_queue_memory_factor
- # Create a queue that produces the examples in batches after shuffling.
- if train:
- images, label_index_batch = tf.train.shuffle_batch_join(
- images_and_labels,
- batch_size=batch_size,
- capacity=min_queue_examples + 3 * batch_size,
- min_after_dequeue=min_queue_examples)
- else:
- images, label_index_batch = tf.train.batch_join(
- images_and_labels,
- batch_size=batch_size,
- capacity=min_queue_examples + 3 * batch_size)
- # Reshape images into these desired dimensions.
- height = FLAGS.image_size
- width = FLAGS.image_size
- depth = 3
- images = tf.cast(images, tf.float32)
- images = tf.reshape(images, shape=[batch_size, height, width, depth])
- # Display the training images in the visualizer.
- tf.image_summary('images', images)
- return images, tf.reshape(label_index_batch, [batch_size])
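- # With the default flags, the shuffling queue above holds up to
- # min_queue_examples + 3 * batch_size = 1024 * 16 + 3 * 32 = 16480 preprocessed
- # images and dequeues batch_size of them per step. A minimal call sketch
- # (illustrative only; `dataset` is a concrete Dataset instance):
- #
- #   images, labels = batch_inputs(dataset, batch_size=32, train=True,
- #                                 num_preprocess_threads=4)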