# Load and parse data with TensorFlow

A TensorFlow example to build input pipelines for loading data efficiently.


- Numpy Arrays
- Images
- CSV file
- Custom data from a Generator

For more information about creating and loading TensorFlow's `TFRecords` data format, see: [tfrecords.ipynb](tfrecords.ipynb)

- Author: Aymeric Damien
- Project: https://github.com/aymericdamien/TensorFlow-Examples/

In [None]:
from __future__ import absolute_import, division, print_function

import numpy as np
import random
import requests
import string
import tarfile
import tensorflow as tf

### Load Numpy Arrays

Build a data pipeline over numpy arrays.

In [None]:
# Toy dataset: the integers 0..99, where even numbers carry label 0 and
# odd numbers carry label 1.
evens = np.arange(0, 100, 2, dtype=np.int32)
odds = evens + 1
evens_label = np.zeros_like(evens)
odds_label = np.ones_like(odds)
# Stack evens first, then odds, keeping features and labels aligned.
features = np.concatenate((evens, odds))
labels = np.concatenate((evens_label, odds_label))

In [None]:
with tf.Graph().as_default():
    # Create TF session. It is bound to the graph built here and is used
    # afterwards to fetch batches.
    sess = tf.Session()

    # Slice the numpy arrays (each row becoming a record).
    data = tf.data.Dataset.from_tensor_slices((features, labels))
    # Shuffle data. The buffer spans the whole dataset, and shuffling is done
    # BEFORE repeating so that every record is served exactly once per pass.
    # Repeating first blends consecutive passes together, which lets a record
    # come back before others have been seen at all.
    data = data.shuffle(buffer_size=len(features))
    # Refill data indefinitely.
    data = data.repeat()
    # Batch data (aggregate records together).
    data = data.batch(batch_size=4)
    # Prefetch batch (pre-load batch for faster consumption).
    data = data.prefetch(buffer_size=1)

    # Create an iterator over the dataset.
    iterator = data.make_initializable_iterator()
    # Initialize the iterator.
    sess.run(iterator.initializer)

    # Get next data batch (a graph tensor; each `sess.run(d)` yields a new batch).
    d = iterator.get_next()

In [None]:
# Pull five batches from the pipeline and print each (features, labels) pair.
for _ in range(5):
    batch = sess.run(d)
    print(*batch)

[82 58 80 23] [0 0 0 1]
[16 91 74 96] [0 1 0 0]
[ 4 17 32 34] [0 1 0 0]
[16 8 77 21] [0 0 1 1]
[20 99 48 18] [0 1 0 0]


### Load CSV files

Build a data pipeline from features stored in a CSV file. For this example, the Titanic dataset, stored in CSV format, is used as a toy dataset.

#### Titanic Dataset



survived|pclass|name|sex|age|sibsp|parch|ticket|fare
--------|------|----|---|---|-----|-----|------|----
1|1|"Allen, Miss. Elisabeth Walton"|female|29|0|0|24160|211.3375
1|1|"Allison, Master. Hudson Trevor"|male|0.9167|1|2|113781|151.5500
0|1|"Allison, Miss. Helen Loraine"|female|2|1|2|113781|151.5500
0|1|"Allison, Mr. Hudson Joshua Creighton"|male|30|1|2|113781|151.5500
...|...|...|...|...|...|...|...|...

In [None]:
# Download Titanic dataset (in csv format).
# A dedicated name is used for the HTTP response so that the `d` batch tensor
# created by the previous pipeline is not overwritten.
response = requests.get(
    "https://raw.githubusercontent.com/tflearn/tflearn.github.io/master/resources/titanic_dataset.csv"
)
# Fail loudly on an HTTP error instead of saving the error page as the CSV.
response.raise_for_status()
with open("titanic_dataset.csv", "wb") as f:
    f.write(response.content)

In [None]:
# Titanic CSV layout (column index -> name):
#   0 survived, 1 pclass, 2 name, 3 sex, 4 age, 5 sibsp, 6 parch, 7 ticket, 8 fare
# Only a subset of the columns is loaded; `record_defaults` gives the type of
# each selected column, in the same order as `column_to_use`.
column_to_use = [
    0,  # survived
    1,  # pclass
    2,  # name
    3,  # sex
    4,  # age
    8,  # fare
]
record_defaults = [
    tf.int32,    # survived
    tf.int32,    # pclass
    tf.string,   # name
    tf.string,   # sex
    tf.float32,  # age
    tf.float32,  # fare
]

In [None]:
with tf.Graph().as_default():
    # Create TF session. It is bound to the graph built here and is used
    # afterwards to fetch batches.
    sess = tf.Session()

    # Load the whole dataset file, and slice each line.
    # `header=True` skips the first line; only the columns listed in
    # `column_to_use` are parsed, typed according to `record_defaults`.
    data = tf.data.experimental.CsvDataset(
        "titanic_dataset.csv",
        record_defaults,
        header=True,
        select_cols=column_to_use,
    )
    # Shuffle data BEFORE repeating, so that one pass over the file is fully
    # served before the next one starts. Repeating first blends consecutive
    # passes together, so a row can come back before others were seen.
    # NOTE: if the file holds more rows than the buffer, the shuffle is only
    # approximate.
    data = data.shuffle(buffer_size=1000)
    # Refill data indefinitely.
    data = data.repeat()
    # Batch data (aggregate records together).
    data = data.batch(batch_size=2)
    # Prefetch batch (pre-load batch for faster consumption).
    data = data.prefetch(buffer_size=1)

    # Create an iterator over the dataset.
    iterator = data.make_initializable_iterator()
    # Initialize the iterator.
    sess.run(iterator.initializer)

    # Get next data batch (a graph tensor; each `sess.run(d)` yields a new batch).
    d = iterator.get_next()

In [None]:
# Pull three batches and print them column by column
# (survived, pclass, name, sex, age, fare), with a blank line after each batch.
for _ in range(3):
    for column in sess.run(d):
        print(column)
    print("")

[1 0]
[3 1]
['Lam, Mr. Ali' 'Widener, Mr. Harry Elkins']
['male' 'male']
[ 0. 27.]
[ 56.4958 211.5 ]

[0 1]
[1 1]
['Baumann, Mr. John D' 'Daly, Mr. Peter Denis ']
['male' 'male']
[ 0. 51.]
[25.925 26.55 ]

[0 1]
[3 1]
['Assam, Mr. Ali' 'Newell, Miss. Madeleine']
['male' 'female']
[23. 31.]
[ 7.05 113.275]



### Load Images

Build a data pipeline by loading images from disk. For this example, the Oxford 17 Flowers dataset is used.

In [None]:
# Download Oxford 17 flowers dataset.
# The archive is streamed to disk chunk by chunk instead of being held in
# memory as a whole, and an HTTP error aborts the download rather than being
# saved as a corrupt archive.
with requests.get("http://www.robots.ox.ac.uk/~vgg/data/flowers/17/17flowers.tgz", stream=True) as response:
    response.raise_for_status()
    with open("17flowers.tgz", "wb") as f:
        for chunk in response.iter_content(chunk_size=1 << 20):
            f.write(chunk)
# Extract archive into the current directory.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run it on archives coming from a trusted source.
with tarfile.open("17flowers.tgz") as t:
    t.extractall()

In [None]:
# Write the index file: one "image_path,label_id" line per image.
# Images are numbered 1..1360 and every run of 80 consecutive images shares
# one label, so the label of image n is (n - 1) // 80 (i.e. 0..16).
with open('jpg/dataset.csv', 'w') as f:
    f.writelines(
        "jpg/image_%04i.jpg,%i\n" % (number, (number - 1) // 80)
        for number in range(1, 1361)
    )

In [None]:
with tf.Graph().as_default():

    # Read the index file: one "image_path,label_id" line per image.
    with open("jpg/dataset.csv") as f:
        dataset_file = f.read().splitlines()

    # Create TF session. It is bound to the graph built here and is used
    # afterwards to fetch batches.
    sess = tf.Session()

    # Load the whole dataset file, and slice each line.
    data = tf.data.Dataset.from_tensor_slices(dataset_file)
    # Shuffle data. The index file is ordered by class, so the buffer must
    # span the whole list to mix all classes uniformly. Only the short text
    # lines are buffered at this stage, not the decoded images.
    # Shuffling happens BEFORE repeating so that each image is served once
    # per pass instead of blending consecutive passes together.
    data = data.shuffle(buffer_size=len(dataset_file))
    # Refill data indefinitely.
    data = data.repeat()

    # Load and pre-process images.
    def load_image(path):
        """Read the JPEG file at `path` and return it as a 256x256x3 float image in [-1, 1]."""
        # Read image from path.
        image = tf.io.read_file(path)
        # Decode the jpeg image to array [0, 255]. Forcing 3 channels keeps
        # the shape identical for every file, which batching requires.
        image = tf.image.decode_jpeg(image, channels=3)
        # Resize images to a common size of 256x256.
        image = tf.image.resize(image, [256, 256])
        # Rescale values to [-1, 1], keeping pixel order (0 -> -1, 255 -> 1).
        image = image / 127.5 - 1.
        return image

    # Decode each line from the dataset file.
    def parse_records(line):
        """Turn one "image_path,label_id" line into an (image, label) pair."""
        # File is in csv format: "image_path,label_id".
        # TensorFlow requires a default value, but it will never be used.
        image_path, image_label = tf.io.decode_csv(line, ["", 0])
        # Apply the function to load images.
        image = load_image(image_path)
        return image, image_label

    # Use 'map' to apply the above functions in parallel.
    data = data.map(parse_records, num_parallel_calls=4)

    # Batch data (aggregate images-array together).
    data = data.batch(batch_size=2)
    # Prefetch batch (pre-load batch for faster consumption).
    data = data.prefetch(buffer_size=1)

    # Create an iterator over the dataset.
    iterator = data.make_initializable_iterator()
    # Initialize the iterator.
    sess.run(iterator.initializer)

    # Get next data batch (a graph tensor; each `sess.run(d)` yields a new batch).
    d = iterator.get_next()

In [None]:
# Fetch a single batch of (images, labels) and print both arrays.
images, image_labels = sess.run(d)
print(images, image_labels)

[[[[ 0.1294117 0.05098033 0.46666664]
 [ 0.1368872 0.05098033 0.48909312]
 [ 0.0931372 0.0068627 0.46029407]
 ...
 [ 0.23480386 0.0522058 0.6102941 ]
 [ 0.12696075 -0.05416667 0.38063723]
 [-0.10024512 -0.28848052 0.10367644]]

 [[ 0.04120708 -0.06118262 0.36256123]
 [ 0.08009624 -0.02229345 0.41640145]
 [ 0.06797445 -0.04132879 0.41923058]
 ...
 [ 0.2495715 0.06697345 0.6251221 ]
 [ 0.12058818 -0.06094813 0.37577546]
 [-0.05184889 -0.24009418 0.16777915]]

 [[-0.09234071 -0.22738981 0.20484066]
 [-0.03100491 -0.17312062 0.2811274 ]
 [ 0.01051998 -0.13237214 0.3376838 ]
 ...
 [ 0.27787983 0.07494056 0.64203525]
 [ 0.11533964 -0.09005249 0.3869906 ]
 [-0.02704227 -0.23958337 0.19454747]]

 ...

 [[ 0.07913595 -0.13069856 0.29874384]
 [ 0.10140878 -0.09445572 0.35912937]
 [ 0.08869672 -0.08415675 0.41446364]
 ...
 [ 0.25821072 0.22463232 0.69197303]
 [ 0.31636214 0.25750512 0.79362744]
 [ 0.09552741 0.01709598 0.57395875]]

 [[ 0.09019601 -0.12156868 0.3098039 ]
 [ 0.17446858 -0.02271283

### Load data from a Generator

Build a data pipeline from a custom generator. For this example, a toy generator yielding a random string, a random vector, and a random integer is used.

In [None]:
# Create a dummy generator.
def generate_features():
    """Yield a single random record: a string, a vector and an integer.

    Yields:
        A tuple ``(text, vector, number)`` where ``text`` is made of 4 random
        ASCII letters, ``vector`` is a numpy array of 4 values drawn with
        ``np.random.uniform`` and ``number`` is a random integer between
        0 and 10 (both included).
    """
    # Function to generate a random string.
    def random_string(length):
        # `range` is used rather than `xrange`, which only exists in Python 2
        # and raises a NameError on Python 3.
        return ''.join(random.choice(string.ascii_letters) for _ in range(length))
    # Return a random string, a random vector, and a random int.
    yield random_string(4), np.random.uniform(size=4), random.randint(0, 10)

In [None]:
with tf.Graph().as_default():

    # Session bound to the graph built here; used afterwards to fetch batches.
    sess = tf.Session()

    # Build the whole input pipeline as a single chain:
    #   generator -> repeat forever -> shuffle -> batch of 4 -> prefetch 1 batch
    data = (
        tf.data.Dataset.from_generator(
            generate_features,
            output_types=(tf.string, tf.float32, tf.int32),
        )
        .repeat()
        .shuffle(buffer_size=100)
        .batch(batch_size=4)
        .prefetch(buffer_size=1)
    )

    # Iterator over the dataset; it must be initialized before being consumed.
    iterator = data.make_initializable_iterator()
    sess.run(iterator.initializer)

    # Tensor yielding the next batch each time it is evaluated.
    d = iterator.get_next()

In [None]:
# Pull five batches and print each (strings, vectors, ints) triple.
for _ in range(5):
    print(*sess.run(d))

['AvCS' 'kAaI' 'QwGX' 'IWOI'] [[0.6096093 0.32192084 0.26622605 0.70250475]
 [0.72534287 0.7637426 0.19977213 0.74121326]
 [0.6930984 0.09409562 0.4063325 0.5002103 ]
 [0.05160935 0.59411395 0.276416 0.98264974]] [1 3 5 6]
['EXjS' 'brvx' 'kwNz' 'eFOb'] [[0.34355283 0.26881003 0.70575935 0.7503411 ]
 [0.9584373 0.27466875 0.27802315 0.9563204 ]
 [0.19129485 0.07014314 0.0932724 0.20726128]
 [0.28744072 0.81736153 0.37507302 0.8984588 ]] [1 9 7 0]
['vpSa' 'UuqW' 'xaTO' 'milw'] [[0.2942028 0.8228986 0.5793326 0.16651365]
 [0.28259405 0.599063 0.2922477 0.95071274]
 [0.23645316 0.00258607 0.06772221 0.7291911 ]
 [0.12861755 0.31435087 0.576638 0.7333119 ]] [3 5 8 4]
['UBBb' 'MUXs' 'nLJB' 'OBGl'] [[0.2677402 0.17931737 0.02607645 0.85898155]
 [0.58647937 0.727203 0.13329858 0.8898983 ]
 [0.13872191 0.47390288 0.7061665 0.08478573]
 [0.3786016 0.22002582 0.91989636 0.45837343]] [ 5 8 0 10]
['kiiz' 'bQYG' 'WpUU' 'AuIY'] [[0.74781317 0.13744462 0.9236441 0.63558507]
 [0.23649399 0.35303807 0.0