瀏覽代碼

memory analysis

John Welsh 4 年之前
父節點
當前提交
3e08490075
共有 4 個文件被更改,包括 163 次插入和 0 次刪除
  1. 34 0
      MEMORY_ANALYSIS.md
  2. 21 0
      build_engines.sh
  3. 39 0
      run_engine.py
  4. 69 0
      run_tf.py

+ 34 - 0
MEMORY_ANALYSIS.md

@@ -0,0 +1,34 @@
+## Instructions
+
+clone project and dependencies
+
+```bash
+git clone --recursive https://github.com/NVIDIA-AI-IOT/tf_to_trt_image_classification
+cd tf_to_trt_image_classification
+```
+
+create frozen graphs for models
+
+```bash
+source scripts/download_models.sh
+python3 models_to_frozen_graphs.py
+```
+
+create engines for various models
+
+```bash
+./build_engines.sh
+```
+
+profile TensorRT memory usage for a model
+
+```bash
+python3 run_engine.py data/engines/inception_v2_int8_bs1.engine
+```
+
+profile TensorFlow memory usage for a model
+
+```bash
+python3 run_tf.py data/frozen_graphs/inception_v2.pb --allow_growth
+```

+ 21 - 0
build_engines.sh

@@ -0,0 +1,21 @@
#!/bin/bash
# Build TensorRT engines for every supported model at FP32, FP16, and INT8
# precision (batch size 1).
#
# Expects frozen graphs in data/frozen_graphs/ (produced by
# models_to_frozen_graphs.py) and writes engines to data/engines/.
#
# Builds run independently: a failing build does not abort the remaining
# ones (same behavior as the original flat command list).

MODELS="vgg_16 inception_v2 resnet_v1_50 mobilenet_v1_1p0_224"

for model in $MODELS; do
    graph="data/frozen_graphs/${model}.pb"
    python build_engine.py "$graph" "data/engines/${model}_fp32_bs1.engine"
    python build_engine.py "$graph" "data/engines/${model}_fp16_bs1.engine" --fp16_mode
    python build_engine.py "$graph" "data/engines/${model}_int8_bs1.engine" --int8_mode
done

+ 39 - 0
run_engine.py

@@ -0,0 +1,39 @@
#!/usr/bin/env python3
"""Measure the GPU memory overhead of executing a serialized TensorRT engine.

pycuda is imported (and the CUDA context created) FIRST, before TensorRT,
so that the baseline memory reading excludes TensorRT's own allocations;
everything TensorRT allocates then shows up in the final delta.
"""
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- creates the CUDA context as a side effect

# Get initial memory before importing other libraries (see module docstring).
gpu_free, gpu_total = cuda.mem_get_info()
gpu_used_0 = gpu_total - gpu_free

import argparse
import tensorrt as trt
import numpy as np
import time  # noqa: F401 -- unused, kept from the original import list

parser = argparse.ArgumentParser(
    description='Report GPU memory used while executing a TensorRT engine.')
parser.add_argument('engine_path', type=str)
parser.add_argument('--batch_size', type=int, default=1)
parser.add_argument('--num_runs', type=int, default=10)
args = parser.parse_args()

logger = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(logger)

with open(args.engine_path, 'rb') as f:
    engine = runtime.deserialize_cuda_engine(f.read())

context = engine.create_execution_context()

# Fixed ImageNet-style bindings: NCHW 3x224x224 input, 1000-class output.
# Zeros are fine since only memory usage is measured, not accuracy.
# NOTE(review): assumes every profiled engine uses these shapes -- confirm.
# Pass dtype= directly instead of .astype() to avoid an intermediate
# float64 array and copy.
input_gpu = gpuarray.to_gpu(
    np.zeros((args.batch_size, 3, 224, 224), dtype=np.float32))
output_gpu = gpuarray.to_gpu(
    np.zeros((args.batch_size, 1000), dtype=np.float32))

for _ in range(args.num_runs):
    context.execute(args.batch_size,
                    [int(input_gpu.gpudata), int(output_gpu.gpudata)])

gpu_free, gpu_total = cuda.mem_get_info()
gpu_used_1 = gpu_total - gpu_free
print('%dMB GPU MEMORY DELTA' % ((gpu_used_1 - gpu_used_0) // 1e6))

+ 69 - 0
run_tf.py

@@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""Measure the GPU memory overhead of executing a frozen TensorFlow graph.

pycuda is imported (and the CUDA context created) FIRST, before TensorFlow,
so that the baseline memory reading excludes TensorFlow's own allocations;
everything TensorFlow allocates then shows up in the final delta.
"""
import pycuda.gpuarray as gpuarray  # noqa: F401 -- unused, kept from original imports
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- creates the CUDA context as a side effect

# Get initial memory before importing other libraries (see module docstring).
gpu_free, gpu_total = cuda.mem_get_info()
gpu_used_0 = gpu_total - gpu_free

import argparse
import numpy as np
import tensorflow as tf
import time  # noqa: F401 -- unused, kept from the original import list

# Ordered (filename substring, logits tensor op name) pairs; first match wins.
_OUTPUT_NAMES = (
    ('vgg_16', 'vgg_16/fc8/BiasAdd'),
    ('vgg_19', 'vgg_19/fc8/BiasAdd'),
    ('inception_v1', 'InceptionV1/Logits/SpatialSqueeze'),
    ('inception_v2', 'InceptionV2/Logits/SpatialSqueeze'),
    ('resnet_v1_50', 'resnet_v1_50/SpatialSqueeze'),
    ('resnet_v1_101', 'resnet_v1_101/SpatialSqueeze'),
    ('resnet_v1_152', 'resnet_v1_152/SpatialSqueeze'),
    ('mobilenet_v1_1p0_224', 'MobilenetV1/Logits/SpatialSqueeze'),
)


def _output_tensor_name(frozen_graph_path):
    """Return the output op name for a known model, keyed by file name.

    Raises:
        RuntimeError: if the path matches none of the known models.
    """
    for key, name in _OUTPUT_NAMES:
        if key in frozen_graph_path:
            return name
    raise RuntimeError('Could not find output name for model.')


parser = argparse.ArgumentParser(
    description='Report GPU memory used while executing a frozen TF graph.')
parser.add_argument('frozen_graph', type=str)
parser.add_argument('--batch_size', type=int, default=1)
parser.add_argument('--num_runs', type=int, default=10)
parser.add_argument('--allow_growth', action='store_true')
args = parser.parse_args()

# LOAD MODEL
with open(args.frozen_graph, 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as graph:
    tf.import_graph_def(graph_def, name="")

tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = args.allow_growth  # disable upfront memory allocation
tf_config.allow_soft_placement = True

output_name = _output_tensor_name(args.frozen_graph)

with tf.Session(config=tf_config, graph=graph) as tf_sess:
    tf_input = tf_sess.graph.get_tensor_by_name('input:0')
    tf_output = tf_sess.graph.get_tensor_by_name(output_name + ':0')

    # NHWC ImageNet-style input; zeros are fine since only memory usage is
    # measured. 'batch' avoids shadowing the builtin 'input'.
    # NOTE(review): assumes every profiled graph takes a 224x224x3 tensor
    # named 'input' -- confirm for new models.
    batch = np.zeros((args.batch_size, 224, 224, 3), dtype=np.float32)

    for _ in range(args.num_runs):
        output = tf_sess.run([tf_output], feed_dict={tf_input: batch})[0]

    gpu_free, gpu_total = cuda.mem_get_info()
    gpu_used_1 = gpu_total - gpu_free
    print('%dMB GPU MEMORY DELTA' % ((gpu_used_1 - gpu_used_0) // 1e6))
+