123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102 |
- #!/bin/bash
- # Copyright 2016 Google Inc. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ==============================================================================
- # Script to download and preprocess ImageNet Challenge 2012
- # training and validation data set.
- #
- # The final output of this script are sharded TFRecord files containing
- # serialized Example protocol buffers. See build_imagenet_data.py for
- # details of how the Example protocol buffers contain the ImageNet data.
- #
- # The final output of this script appears as such:
- #
- # data_dir/train-00000-of-01024
- # data_dir/train-00001-of-01024
- # ...
- # data_dir/train-01023-of-01024
- #
- # and
- #
- # data_dir/validation-00000-of-00128
- # data_dir/validation-00001-of-00128
- # ...
- # data_dir/validation-00127-of-00128
- #
- # Note that this script may take several hours to run to completion. The
- # conversion of the ImageNet data to TFRecords alone takes 2-3 hours depending
- # on the speed of your machine. Please be patient.
- #
- # **IMPORTANT**
- # To download the raw images, the user must create an account with image-net.org
- # and generate a username and access_key. The latter two are required for
- # downloading the raw images.
- #
- # usage:
- # ./download_and_preprocess_imagenet.sh [data-dir]
# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails (so that a failure of the bounding-box
# extractor below is not hidden by a successful `sort`).
set -euo pipefail

# The single required argument is the output data directory.
if [[ -z "${1:-}" ]]; then
  # A missing argument is an error: report it on stderr and exit non-zero so
  # that callers can detect the failure.
  echo "usage download_and_preprocess_imagenet.sh [data dir]" >&2
  exit 1
fi

# Create the output and temporary directories.
# One trailing slash is stripped from the argument so that the joined paths
# below do not contain "//".
DATA_DIR="${1%/}"
SCRATCH_DIR="${DATA_DIR}/raw-data/"
mkdir -p "${DATA_DIR}"
mkdir -p "${SCRATCH_DIR}"

# Helper scripts and data files are looked up under "<this script>.runfiles"
# (presumably the Bazel runfiles tree -- confirm against the build setup).
WORK_DIR="$0.runfiles/inception"

# Download the ImageNet data.
# The download script is invoked with the scratch directory and the synsets
# list as its two arguments.
LABELS_FILE="${WORK_DIR}/data/imagenet_lsvrc_2015_synsets.txt"
DOWNLOAD_SCRIPT="${WORK_DIR}/data/download_imagenet.sh"
"${DOWNLOAD_SCRIPT}" "${SCRATCH_DIR}" "${LABELS_FILE}"

# Note the locations of the train and validation data.
# SCRATCH_DIR already ends in "/", so sub-paths are appended directly.
TRAIN_DIRECTORY="${SCRATCH_DIR}train/"
VALIDATION_DIRECTORY="${SCRATCH_DIR}validation/"

# Preprocess the validation data by moving the images into the appropriate
# sub-directory based on the label (synset) of the image.
echo "Organizing the validation data into sub-directories."
PREPROCESS_VAL_SCRIPT="${WORK_DIR}/data/preprocess_imagenet_validation_data.py"
VAL_LABELS_FILE="${WORK_DIR}/data/imagenet_2012_validation_synset_labels.txt"
"${PREPROCESS_VAL_SCRIPT}" "${VALIDATION_DIRECTORY}" "${VAL_LABELS_FILE}"

# Convert the XML files for bounding box annotations into a single CSV.
# The extractor's stdout is sorted and written to BOUNDING_BOX_FILE; with
# pipefail enabled, a failure of the extractor aborts the script instead of
# leaving a truncated CSV for the build step to consume.
echo "Extracting bounding box information from XML."
BOUNDING_BOX_SCRIPT="${WORK_DIR}/data/process_bounding_boxes.py"
BOUNDING_BOX_FILE="${SCRATCH_DIR}imagenet_2012_bounding_boxes.csv"
BOUNDING_BOX_DIR="${SCRATCH_DIR}bounding_boxes/"
"${BOUNDING_BOX_SCRIPT}" "${BOUNDING_BOX_DIR}" "${LABELS_FILE}" \
  | sort >"${BOUNDING_BOX_FILE}"
echo "Finished downloading and preprocessing the ImageNet data."

# Build the TFRecords version of the ImageNet data.
# The output is written directly into DATA_DIR, next to the raw-data scratch
# directory.
BUILD_SCRIPT="${WORK_DIR}/build_imagenet_data"
OUTPUT_DIRECTORY="${DATA_DIR}"
IMAGENET_METADATA_FILE="${WORK_DIR}/data/imagenet_metadata.txt"
"${BUILD_SCRIPT}" \
  --train_directory="${TRAIN_DIRECTORY}" \
  --validation_directory="${VALIDATION_DIRECTORY}" \
  --output_directory="${OUTPUT_DIRECTORY}" \
  --imagenet_metadata_file="${IMAGENET_METADATA_FILE}" \
  --labels_file="${LABELS_FILE}" \
  --bounding_box_file="${BOUNDING_BOX_FILE}"
|