process_bounding_boxes.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. #!/usr/bin/python
  2. # Copyright 2016 Google Inc. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. # ==============================================================================
  16. """Process the ImageNet Challenge bounding boxes for TensorFlow model training.
  17. This script is called as
  18. process_bounding_boxes.py <dir> [synsets-file]
  19. Where <dir> is a directory containing the downloaded and unpacked bounding box
  20. data. If [synsets-file] is supplied, then only the bounding boxes whose
  21. synsets are contained within this file are returned. Note that the
  22. [synsets-file] file contains synset ids, one per line.
  23. The script dumps out a CSV text file in which each line contains an entry.
  24. n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940
  25. The entry can be read as:
  26. <JPEG file name>, <xmin>, <ymin>, <xmax>, <ymax>
  27. The bounding box for <JPEG file name> contains two points (xmin, ymin) and
  28. (xmax, ymax) specifying the lower-left corner and upper-right corner of a
  29. bounding box in *relative* coordinates.
  30. The user supplies a directory where the XML files reside. The directory
  31. structure in the directory <dir> is assumed to look like this:
  32. <dir>/nXXXXXXXX/nXXXXXXXX_YYYY.xml
  33. Each XML file contains a bounding box annotation. The script:
  34. (1) Parses the XML file and extracts the filename, label and bounding box info.
  35. (2) The bounding box is specified in the XML files as integer (xmin, ymin) and
  36. (xmax, ymax) *relative* to image size displayed to the human annotator. The
  37. size of the image displayed to the human annotator is stored in the XML file
  38. as integer (height, width).
  39. Note that the displayed size will differ from the actual size of the image
  40. downloaded from image-net.org. To make the bounding box annotation useable,
  41. we convert bounding box to floating point numbers relative to displayed
  42. height and width of the image.
  43. Note that each XML file might contain N bounding box annotations.
  44. Note that the points are all clamped at a range of [0.0, 1.0] because some
  45. human annotations extend outside the range of the supplied image.
  46. See details here: http://image-net.org/download-bboxes
  47. (3) By default, the script outputs all valid bounding boxes. If a
  48. [synsets-file] is supplied, only the subset of bounding boxes associated
  49. with those synsets are outputted. Importantly, one can supply a list of
  50. synsets in the ImageNet Challenge and output the list of bounding boxes
  51. associated with the training images of the ILSVRC.
  52. We use these bounding boxes to inform the random distortion of images
  53. supplied to the network.
  54. If you run this script successfully, you will see the following output
  55. to stderr:
  56. > Finished processing 544546 XML files.
  57. > Skipped 0 XML files not in ImageNet Challenge.
  58. > Skipped 0 bounding boxes not in ImageNet Challenge.
  59. > Wrote 615299 bounding boxes from 544546 annotated images.
  60. """
  61. from __future__ import absolute_import
  62. from __future__ import division
  63. from __future__ import print_function
  64. import glob
  65. import os.path
  66. import sys
  67. import xml.etree.ElementTree as ET
  68. class BoundingBox(object):
  69. pass
  70. def GetItem(name, root, index=0):
  71. count = 0
  72. for item in root.iter(name):
  73. if count == index:
  74. return item.text
  75. count += 1
  76. # Failed to find "index" occurrence of item.
  77. return -1
  78. def GetInt(name, root, index=0):
  79. return int(GetItem(name, root, index))
  80. def FindNumberBoundingBoxes(root):
  81. index = 0
  82. while True:
  83. if GetInt('xmin', root, index) == -1:
  84. break
  85. index += 1
  86. return index
  87. def ProcessXMLAnnotation(xml_file):
  88. """Process a single XML file containing a bounding box."""
  89. # pylint: disable=broad-except
  90. try:
  91. tree = ET.parse(xml_file)
  92. except Exception:
  93. print('Failed to parse: ' + xml_file, file=sys.stderr)
  94. return None
  95. # pylint: enable=broad-except
  96. root = tree.getroot()
  97. num_boxes = FindNumberBoundingBoxes(root)
  98. boxes = []
  99. for index in xrange(num_boxes):
  100. box = BoundingBox()
  101. # Grab the 'index' annotation.
  102. box.xmin = GetInt('xmin', root, index)
  103. box.ymin = GetInt('ymin', root, index)
  104. box.xmax = GetInt('xmax', root, index)
  105. box.ymax = GetInt('ymax', root, index)
  106. box.width = GetInt('width', root)
  107. box.height = GetInt('height', root)
  108. box.filename = GetItem('filename', root) + '.JPEG'
  109. box.label = GetItem('name', root)
  110. xmin = float(box.xmin) / float(box.width)
  111. xmax = float(box.xmax) / float(box.width)
  112. ymin = float(box.ymin) / float(box.height)
  113. ymax = float(box.ymax) / float(box.height)
  114. # Some images contain bounding box annotations that
  115. # extend outside of the supplied image. See, e.g.
  116. # n03127925/n03127925_147.xml
  117. # Additionally, for some bounding boxes, the min > max
  118. # or the box is entirely outside of the image.
  119. min_x = min(xmin, xmax)
  120. max_x = max(xmin, xmax)
  121. box.xmin_scaled = min(max(min_x, 0.0), 1.0)
  122. box.xmax_scaled = min(max(max_x, 0.0), 1.0)
  123. min_y = min(ymin, ymax)
  124. max_y = max(ymin, ymax)
  125. box.ymin_scaled = min(max(min_y, 0.0), 1.0)
  126. box.ymax_scaled = min(max(max_y, 0.0), 1.0)
  127. boxes.append(box)
  128. return boxes
  129. if __name__ == '__main__':
  130. if len(sys.argv) < 2 or len(sys.argv) > 3:
  131. print('Invalid usage\n'
  132. 'usage: process_bounding_boxes.py <dir> [synsets-file]',
  133. file=sys.stderr)
  134. sys.exit(-1)
  135. xml_files = glob.glob(sys.argv[1] + '/*/*.xml')
  136. print('Identified %d XML files in %s' % (len(xml_files), sys.argv[1]),
  137. file=sys.stderr)
  138. if len(sys.argv) == 3:
  139. labels = set([l.strip() for l in open(sys.argv[2]).readlines()])
  140. print('Identified %d synset IDs in %s' % (len(labels), sys.argv[2]),
  141. file=sys.stderr)
  142. else:
  143. labels = None
  144. skipped_boxes = 0
  145. skipped_files = 0
  146. saved_boxes = 0
  147. saved_files = 0
  148. for file_index, one_file in enumerate(xml_files):
  149. # Example: <...>/n06470073/n00141669_6790.xml
  150. label = os.path.basename(os.path.dirname(one_file))
  151. # Determine if the annotation is from an ImageNet Challenge label.
  152. if labels is not None and label not in labels:
  153. skipped_files += 1
  154. continue
  155. bboxes = ProcessXMLAnnotation(one_file)
  156. assert bboxes is not None, 'No bounding boxes found in ' + one_file
  157. found_box = False
  158. for bbox in bboxes:
  159. if labels is not None:
  160. if bbox.label != label:
  161. # Note: There is a slight bug in the bounding box annotation data.
  162. # Many of the dog labels have the human label 'Scottish_deerhound'
  163. # instead of the synset ID 'n02092002' in the bbox.label field. As a
  164. # simple hack to overcome this issue, we only exclude bbox labels
  165. # *which are synset ID's* that do not match original synset label for
  166. # the XML file.
  167. if bbox.label in labels:
  168. skipped_boxes += 1
  169. continue
  170. # Guard against improperly specified boxes.
  171. if (bbox.xmin_scaled >= bbox.xmax_scaled or
  172. bbox.ymin_scaled >= bbox.ymax_scaled):
  173. skipped_boxes += 1
  174. continue
  175. # Note bbox.filename occasionally contains '%s' in the name. This is
  176. # data set noise that is fixed by just using the basename of the XML file.
  177. image_filename = os.path.splitext(os.path.basename(one_file))[0]
  178. print('%s.JPEG,%.4f,%.4f,%.4f,%.4f' %
  179. (image_filename,
  180. bbox.xmin_scaled, bbox.ymin_scaled,
  181. bbox.xmax_scaled, bbox.ymax_scaled))
  182. saved_boxes += 1
  183. found_box = True
  184. if found_box:
  185. saved_files += 1
  186. else:
  187. skipped_files += 1
  188. if not file_index % 5000:
  189. print('--> processed %d of %d XML files.' %
  190. (file_index + 1, len(xml_files)),
  191. file=sys.stderr)
  192. print('--> skipped %d boxes and %d XML files.' %
  193. (skipped_boxes, skipped_files), file=sys.stderr)
  194. print('Finished processing %d XML files.' % len(xml_files), file=sys.stderr)
  195. print('Skipped %d XML files not in ImageNet Challenge.' % skipped_files,
  196. file=sys.stderr)
  197. print('Skipped %d bounding boxes not in ImageNet Challenge.' % skipped_boxes,
  198. file=sys.stderr)
  199. print('Wrote %d bounding boxes from %d annotated images.' %
  200. (saved_boxes, saved_files),
  201. file=sys.stderr)
  202. print('Finished.', file=sys.stderr)