{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load and parse data with TensorFlow 2.0 (tf.data)\n",
    "\n",
    "A TensorFlow 2.0 example to build input pipelines for loading data efficiently.\n",
    "\n",
    "\n",
    "- Numpy Arrays\n",
    "- Images\n",
    "- CSV file\n",
    "- Custom data from a Generator\n",
    "\n",
    "For more information about creating and loading TensorFlow's `TFRecords` data format, see: [tfrecords.ipynb](tfrecords.ipynb)\n",
    "\n",
    "- Author: Aymeric Damien\n",
    "- Project: https://github.com/aymericdamien/TensorFlow-Examples/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import absolute_import, division, print_function\n",
    "\n",
    "import numpy as np\n",
    "import random\n",
    "import requests\n",
    "import string\n",
    "import tarfile\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load Numpy Arrays\n",
    "\n",
    "Build a data pipeline over numpy arrays."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a toy dataset (even and odd numbers, with respective labels of 0 and 1).\n",
    "evens = np.arange(0, 100, step=2, dtype=np.int32)\n",
    "evens_label = np.zeros(50, dtype=np.int32)\n",
    "odds = np.arange(1, 100, step=2, dtype=np.int32)\n",
    "odds_label = np.ones(50, dtype=np.int32)\n",
    "# Concatenate arrays\n",
    "features = np.concatenate([evens, odds])\n",
    "labels = np.concatenate([evens_label, odds_label])\n",
    "\n",
    "# Load a numpy array using tf data api with `from_tensor_slices`.\n",
    "data = tf.data.Dataset.from_tensor_slices((features, labels))\n",
    "# Refill data indefinitely.  \n",
    "data = data.repeat()\n",
    "# Shuffle data.\n",
    "data = data.shuffle(buffer_size=100)\n",
    "# Batch data (aggregate records together).\n",
    "data = data.batch(batch_size=4)\n",
    "# Prefetch batch (pre-load batch for faster consumption).\n",
    "data = data.prefetch(buffer_size=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor([ 9 94 29 85], shape=(4,), dtype=int32) tf.Tensor([1 0 1 1], shape=(4,), dtype=int32)\n",
      "tf.Tensor([68 57 88 41], shape=(4,), dtype=int32) tf.Tensor([0 1 0 1], shape=(4,), dtype=int32)\n",
      "tf.Tensor([51 19 18 56], shape=(4,), dtype=int32) tf.Tensor([1 1 0 0], shape=(4,), dtype=int32)\n",
      "tf.Tensor([70 84 99 32], shape=(4,), dtype=int32) tf.Tensor([0 0 1 0], shape=(4,), dtype=int32)\n",
      "tf.Tensor([40  0 25 28], shape=(4,), dtype=int32) tf.Tensor([0 0 1 0], shape=(4,), dtype=int32)\n"
     ]
    }
   ],
   "source": [
    "for batch_x, batch_y in data.take(5):\n",
    "    print(batch_x, batch_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor([ 9 94 29 85], shape=(4,), dtype=int32) tf.Tensor([1 0 1 1], shape=(4,), dtype=int32)\n",
      "tf.Tensor([68 57 88 41], shape=(4,), dtype=int32) tf.Tensor([0 1 0 1], shape=(4,), dtype=int32)\n",
      "tf.Tensor([51 19 18 56], shape=(4,), dtype=int32) tf.Tensor([1 1 0 0], shape=(4,), dtype=int32)\n",
      "tf.Tensor([70 84 99 32], shape=(4,), dtype=int32) tf.Tensor([0 0 1 0], shape=(4,), dtype=int32)\n",
      "tf.Tensor([40  0 25 28], shape=(4,), dtype=int32) tf.Tensor([0 0 1 0], shape=(4,), dtype=int32)\n",
      "tf.Tensor([20 38 22 79], shape=(4,), dtype=int32) tf.Tensor([0 0 0 1], shape=(4,), dtype=int32)\n",
      "tf.Tensor([20 22 96 27], shape=(4,), dtype=int32) tf.Tensor([0 0 0 1], shape=(4,), dtype=int32)\n",
      "tf.Tensor([34 58 86 67], shape=(4,), dtype=int32) tf.Tensor([0 0 0 1], shape=(4,), dtype=int32)\n",
      "tf.Tensor([ 2 98 24 21], shape=(4,), dtype=int32) tf.Tensor([0 0 0 1], shape=(4,), dtype=int32)\n",
      "tf.Tensor([16 45 18 35], shape=(4,), dtype=int32) tf.Tensor([0 1 0 1], shape=(4,), dtype=int32)\n"
     ]
    }
   ],
   "source": [
    "# Note: If you are planning on calling multiple time,\n",
    "# you can user the iterator way:\n",
    "ite_data = iter(data)\n",
    "for i in range(5):\n",
    "    batch_x, batch_y = next(ite_data)\n",
    "    print(batch_x, batch_y)\n",
    "\n",
    "for i in range(5):\n",
    "    batch_x, batch_y = next(ite_data)\n",
    "    print(batch_x, batch_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load CSV files\n",
    "\n",
    "Build a data pipeline from features stored in a CSV file. For this example, Titanic dataset will be used as a toy dataset stored in CSV format."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Titanic Dataset\n",
    "\n",
    "\n",
    "\n",
    "survived|pclass|name|sex|age|sibsp|parch|ticket|fare\n",
    "--------|------|----|---|---|-----|-----|------|----\n",
    "1|1|\"Allen, Miss. Elisabeth Walton\"|female|29|0|0|24160|211.3375\n",
    "1|1|\"Allison, Master. Hudson Trevor\"|male|0.9167|1|2|113781|151.5500\n",
    "0|1|\"Allison, Miss. Helen Loraine\"|female|2|1|2|113781|151.5500\n",
    "0|1|\"Allison, Mr. Hudson Joshua Creighton\"|male|30|1|2|113781|151.5500\n",
    "...|...|...|...|...|...|...|...|..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download Titanic dataset (in csv format).\n",
    "d = requests.get(\"https://raw.githubusercontent.com/tflearn/tflearn.github.io/master/resources/titanic_dataset.csv\")\n",
    "with open(\"titanic_dataset.csv\", \"wb\") as f:\n",
    "    f.write(d.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Titanic dataset.\n",
    "# Original features: survived,pclass,name,sex,age,sibsp,parch,ticket,fare\n",
    "# Select specific columns: survived,pclass,name,sex,age,fare\n",
    "column_to_use = [0, 1, 2, 3, 4, 8]\n",
    "record_defaults = [tf.int32, tf.int32, tf.string, tf.string, tf.float32, tf.float32]\n",
    "\n",
    "# Load the whole dataset file, and slice each line.\n",
    "data = tf.data.experimental.CsvDataset(\"titanic_dataset.csv\", record_defaults, header=True, select_cols=column_to_use)\n",
    "# Refill data indefinitely.\n",
    "data = data.repeat()\n",
    "# Shuffle data.\n",
    "data = data.shuffle(buffer_size=1000)\n",
    "# Batch data (aggregate records together).\n",
    "data = data.batch(batch_size=2)\n",
    "# Prefetch batch (pre-load batch for faster consumption).\n",
    "data = data.prefetch(buffer_size=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1 1]\n",
      "[2 2]\n",
      "['Richards, Master. George Sibley' 'Rugg, Miss. Emily']\n",
      "['male' 'female']\n",
      "[ 0.8333 21.    ]\n",
      "[18.75 10.5 ]\n"
     ]
    }
   ],
   "source": [
    "for survived, pclass, name, sex, age, fare in data.take(1):\n",
    "    print(survived.numpy())\n",
    "    print(pclass.numpy())\n",
    "    print(name.numpy())\n",
    "    print(sex.numpy())\n",
    "    print(age.numpy())\n",
    "    print(fare.numpy())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load Images\n",
    "\n",
    "Build a data pipeline by loading images from disk. For this example, Oxford Flowers dataset will be used."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download Oxford 17 flowers dataset\n",
    "d = requests.get(\"http://www.robots.ox.ac.uk/~vgg/data/flowers/17/17flowers.tgz\")\n",
    "with open(\"17flowers.tgz\", \"wb\") as f:\n",
    "    f.write(d.content)\n",
    "# Extract archive.\n",
    "with tarfile.open(\"17flowers.tgz\") as t:\n",
    "    t.extractall()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('jpg/dataset.csv', 'w') as f:\n",
    "    c = 0\n",
    "    for i in range(1360):\n",
    "        f.write(\"jpg/image_%04i.jpg,%i\\n\" % (i+1, c))\n",
    "        if (i+1) % 80 == 0:\n",
    "            c += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Images\n",
    "with open(\"jpg/dataset.csv\") as f:\n",
    "    dataset_file = f.read().splitlines()\n",
    "\n",
    "# Load the whole dataset file, and slice each line.\n",
    "data = tf.data.Dataset.from_tensor_slices(dataset_file)\n",
    "# Refill data indefinitely.\n",
    "data = data.repeat()\n",
    "# Shuffle data.\n",
    "data = data.shuffle(buffer_size=1000)\n",
    "\n",
    "# Load and pre-process images.\n",
    "def load_image(path):\n",
    "    # Read image from path.\n",
    "    image = tf.io.read_file(path)\n",
    "    # Decode the jpeg image to array [0, 255].\n",
    "    image = tf.image.decode_jpeg(image)\n",
    "    # Resize images to a common size of 256x256.\n",
    "    image = tf.image.resize(image, [256, 256])\n",
    "    # Rescale values to [-1, 1].\n",
    "    image = 1. - image / 127.5\n",
    "    return image\n",
    "# Decode each line from the dataset file.\n",
    "def parse_records(line):\n",
    "    # File is in csv format: \"image_path,label_id\".\n",
    "    # TensorFlow requires a default value, but it will never be used.\n",
    "    image_path, image_label = tf.io.decode_csv(line, [\"\", 0])\n",
    "    # Apply the function to load images.\n",
    "    image = load_image(image_path)\n",
    "    return image, image_label\n",
    "# Use 'map' to apply the above functions in parallel.\n",
    "data = data.map(parse_records, num_parallel_calls=4)\n",
    "\n",
    "# Batch data (aggregate images-array together).\n",
    "data = data.batch(batch_size=2)\n",
    "# Prefetch batch (pre-load batch for faster consumption).\n",
    "data = data.prefetch(buffer_size=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor(\n",
      "[[[[-0.90260804 -0.9550551  -0.9444355 ]\n",
      "   [-0.9538603  -0.9715073  -0.9136642 ]\n",
      "   [-0.41687727 -0.37570083 -0.25462234]\n",
      "   ...\n",
      "   [ 0.4617647   0.422549    0.3754902 ]\n",
      "   [ 0.4934436   0.45422792  0.4071691 ]\n",
      "   [ 0.5530829   0.5138672   0.46680838]]\n",
      "\n",
      "  [[-0.9301815  -0.98563874 -0.9595933 ]\n",
      "   [-0.9379289  -0.95557594 -0.89773285]\n",
      "   [-0.68581116 -0.6446346  -0.5305033 ]\n",
      "   ...\n",
      "   [ 0.46960783  0.43039215  0.38333333]\n",
      "   [ 0.5009191   0.46170342  0.4146446 ]\n",
      "   [ 0.56071925  0.52150357  0.4744447 ]]\n",
      "\n",
      "  [[-0.9480392  -0.9862745  -0.96889937]\n",
      "   [-0.93367803 -0.9485103  -0.8916054 ]\n",
      "   [-0.9224341  -0.9033165  -0.7915518 ]\n",
      "   ...\n",
      "   [ 0.48045343  0.44123775  0.39417893]\n",
      "   [ 0.51623774  0.47702205  0.42996323]\n",
      "   [ 0.5740809   0.5348652   0.48780638]]\n",
      "\n",
      "  ...\n",
      "\n",
      "  [[ 0.0824219   0.37201285  0.5615885 ]\n",
      "   [ 0.09744179  0.3858226   0.57758886]\n",
      "   [ 0.1170305   0.4023859   0.59906554]\n",
      "   ...\n",
      "   [ 0.02599955  0.65661     0.7460593 ]\n",
      "   [-0.0751493   0.6735256   0.7022212 ]\n",
      "   [-0.06794965  0.73861444  0.7482958 ]]\n",
      "\n",
      "  [[ 0.10942864  0.39136028  0.5135914 ]\n",
      "   [ 0.18471968  0.4658088   0.5954542 ]\n",
      "   [ 0.21578586  0.4813496   0.6320619 ]\n",
      "   ...\n",
      "   [ 0.22432214  0.676777    0.8324946 ]\n",
      "   [ 0.10089612  0.73174024  0.7959444 ]\n",
      "   [ 0.00907248  0.74025357  0.7495098 ]]\n",
      "\n",
      "  [[ 0.15197992  0.43433285  0.54413676]\n",
      "   [ 0.20049018  0.48284316  0.60343134]\n",
      "   [ 0.2664752   0.5252987   0.6713772 ]\n",
      "   ...\n",
      "   [ 0.24040669  0.6644263   0.8296224 ]\n",
      "   [ 0.10060894  0.7192364   0.78786385]\n",
      "   [ 0.05363435  0.77765393  0.78206575]]]\n",
      "\n",
      "\n",
      " [[[-0.49571514 -0.2133621   0.6807555 ]\n",
      "   [-0.52243936 -0.2322433   0.66971743]\n",
      "   [-0.5502666  -0.24438429  0.6732628 ]\n",
      "   ...\n",
      "   [-0.61084557 -0.22653186  0.7019608 ]\n",
      "   [-0.60784316 -0.21568632  0.65843004]\n",
      "   [-0.6197916  -0.22585356  0.6411722 ]]\n",
      "\n",
      "  [[-0.5225973  -0.24024439  0.6538732 ]\n",
      "   [-0.54144406 -0.26501226  0.64094764]\n",
      "   [-0.56139374 -0.27119768  0.6341878 ]\n",
      "   ...\n",
      "   [-0.6186887  -0.22824419  0.67053366]\n",
      "   [-0.59662986 -0.22015929  0.6358456 ]\n",
      "   [-0.6119485  -0.23387194  0.6130515 ]]\n",
      "\n",
      "  [[-0.54999995 -0.26764703  0.61539805]\n",
      "   [-0.56739867 -0.28504562  0.6056473 ]\n",
      "   [-0.58733106 -0.297135    0.5988358 ]\n",
      "   ...\n",
      "   [-0.62097263 -0.22653186  0.62466395]\n",
      "   [-0.60171235 -0.21739864  0.5984136 ]\n",
      "   [-0.614951   -0.23063731  0.579271  ]]\n",
      "\n",
      "  ...\n",
      "\n",
      "  [[-0.49420047 -0.25567698 -0.29812205]\n",
      "   [-0.5336498  -0.31243873 -0.34749448]\n",
      "   [-0.5600954  -0.35433567 -0.38869584]\n",
      "   ...\n",
      "   [ 0.4558211   0.22837007  0.47150737]\n",
      "   [ 0.49019605  0.24705881  0.4980392 ]\n",
      "   [ 0.5021446   0.25900733  0.5099877 ]]\n",
      "\n",
      "  [[-0.50617576 -0.29696214 -0.31009734]\n",
      "   [-0.47532892 -0.28324962 -0.28901553]\n",
      "   [-0.45759463 -0.28628123 -0.28675795]\n",
      "   ...\n",
      "   [ 0.46366423  0.2362132   0.4793505 ]\n",
      "   [ 0.4980392   0.25490195  0.5058824 ]\n",
      "   [ 0.5099877   0.26685047  0.51783085]]\n",
      "\n",
      "  [[-0.45882356 -0.254902   -0.26274514]\n",
      "   [-0.4185791  -0.23034382 -0.23034382]\n",
      "   [-0.37365198 -0.21194851 -0.20410538]\n",
      "   ...\n",
      "   [ 0.46366423  0.2362132   0.4793505 ]\n",
      "   [ 0.4980392   0.25490195  0.5058824 ]\n",
      "   [ 0.5099877   0.26685047  0.51783085]]]], shape=(2, 256, 256, 3), dtype=float32) tf.Tensor([8 8], shape=(2,), dtype=int32)\n"
     ]
    }
   ],
   "source": [
    "for batch_x, batch_y in data.take(1):\n",
    "    print(batch_x, batch_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load data from a Generator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a dummy generator.\n",
    "def generate_features():\n",
    "    # Function to generate a random string.\n",
    "    def random_string(length):\n",
    "        return ''.join(random.choice(string.ascii_letters) for m in xrange(length))\n",
    "    # Return a random string, a random vector, and a random int.\n",
    "    yield random_string(4), np.random.uniform(size=4), random.randint(0, 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load a numpy array using tf data api with `from_tensor_slices`.\n",
    "data = tf.data.Dataset.from_generator(generate_features, output_types=(tf.string, tf.float32, tf.int32))\n",
    "# Refill data indefinitely.\n",
    "data = data.repeat()\n",
    "# Shuffle data.\n",
    "data = data.shuffle(buffer_size=100)\n",
    "# Batch data (aggregate records together).\n",
    "data = data.batch(batch_size=4)\n",
    "# Prefetch batch (pre-load batch for faster consumption).\n",
    "data = data.prefetch(buffer_size=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor(['snDw' 'NvMp' 'sXsw' 'qwuk'], shape=(4,), dtype=string) tf.Tensor(\n",
      "[[0.22296238 0.03515657 0.3893014  0.6875752 ]\n",
      " [0.05003363 0.27605608 0.23262134 0.10671499]\n",
      " [0.8992419  0.34516433 0.29739627 0.8413017 ]\n",
      " [0.91913974 0.7142106  0.48333576 0.04300505]], shape=(4, 4), dtype=float32) tf.Tensor([ 2 10  4  1], shape=(4,), dtype=int32)\n",
      "tf.Tensor(['vdUx' 'InFi' 'nLzy' 'oklE'], shape=(4,), dtype=string) tf.Tensor(\n",
      "[[0.6512162  0.8695475  0.7012295  0.6849636 ]\n",
      " [0.00812997 0.01264008 0.7774404  0.44849646]\n",
      " [0.92055863 0.894824   0.3628448  0.85603875]\n",
      " [0.32219294 0.9767527  0.0307372  0.12051418]], shape=(4, 4), dtype=float32) tf.Tensor([9 7 4 0], shape=(4,), dtype=int32)\n",
      "tf.Tensor(['ULGI' 'dBbm' 'URgs' 'Pkpt'], shape=(4,), dtype=string) tf.Tensor(\n",
      "[[0.39586228 0.7472     0.3759462  0.9277406 ]\n",
      " [0.44489694 0.38694733 0.9592599  0.82675934]\n",
      " [0.12597603 0.299358   0.6940909  0.34155408]\n",
      " [0.3401377  0.97620344 0.6047712  0.51667166]], shape=(4, 4), dtype=float32) tf.Tensor([ 4 10  0  0], shape=(4,), dtype=int32)\n",
      "tf.Tensor(['kvao' 'wWvG' 'vrzf' 'cMgG'], shape=(4,), dtype=string) tf.Tensor(\n",
      "[[0.8090979  0.65837437 0.9732402  0.9298921 ]\n",
      " [0.67059356 0.91655296 0.52894515 0.8964492 ]\n",
      " [0.05753202 0.45829964 0.74948853 0.41164723]\n",
      " [0.42602295 0.8696292  0.57220364 0.9475169 ]], shape=(4, 4), dtype=float32) tf.Tensor([6 7 6 2], shape=(4,), dtype=int32)\n",
      "tf.Tensor(['kyLQ' 'kxbI' 'CkQD' 'PHlJ'], shape=(4,), dtype=string) tf.Tensor(\n",
      "[[0.29089147 0.6438517  0.31005543 0.31286424]\n",
      " [0.0937152  0.8887667  0.24011584 0.25746483]\n",
      " [0.47577712 0.53731906 0.9178111  0.3249844 ]\n",
      " [0.38328    0.39294246 0.08126572 0.5995307 ]], shape=(4, 4), dtype=float32) tf.Tensor([3 1 3 2], shape=(4,), dtype=int32)\n"
     ]
    }
   ],
   "source": [
    "# Display data.\n",
    "for batch_str, batch_vector, batch_int in data.take(5):\n",
    "    print(batch_str, batch_vector, batch_int)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}