Browse Source

Merge pull request #13916 from xwang2713/HPCC-24153-add-ml-docker-images

HPCC-24153 add three Machine Learning Docker images build files

Reviewed-By: Jake Smith <jake.smith@lexisnexis.com>
Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 4 years ago
parent
commit
3b8d8505c4

+ 68 - 10
dockerfiles/buildall.sh

@@ -27,6 +27,13 @@ BUILD_USER=hpcc-systems                         # The github repo owner
 BUILD_TYPE=                                     # Set to Debug for a debug build, leave blank for default (RelWithDebInfo)
 USE_CPPUNIT=1
 
+# BUILD_ML selects which Machine Learning images build_ml_image builds:
+# "all", or a space-separated subset of ml_features (e.g. "gnn gnn-gpu").
+# When it is empty or unset no Machine Learning image is built.
+#BUILD_ML=all #ml,gnn,gnn-gpu
+# Known ML features; each name is a sub-directory of ml/ holding a Dockerfile.
+ml_features=(
+  'ml'
+  'gnn'
+  'gnn-gpu'
+)
+
 # These values are set in a GitHub workflow build
 
 [[ -n ${INPUT_BUILD_USER} ]] && BUILD_USER=${INPUT_BUILD_USER}
@@ -86,23 +93,74 @@ build_image() {
        --build-arg USE_CPPUNIT=${USE_CPPUNIT} \
        --build-arg BUILD_THREADS=${BUILD_THREADS} \
        ${name}/ 
-    if [ "$LATEST" = "1" ] ; then
-      docker tag hpccsystems/${name}:${label} hpccsystems/${name}:latest
-      if [ "$PUSH" = "1" ] ; then
-        docker push hpccsystems/${name}:${label}
-        docker push hpccsystems/${name}:latest
-      fi
-    else
-      if [ "$PUSH" = "1" ] ; then
-        docker push hpccsystems/${name}:${label}
-      fi
+  fi
+  push_image $name $label
+}
+
+push_image() {
+  local name=$1
+  local label=$2
+  if [ "$LATEST" = "1" ] ; then
+    docker tag hpccsystems/${name}:${label} hpccsystems/${name}:latest
+    if [ "$PUSH" = "1" ] ; then
+      docker push hpccsystems/${name}:${label}
+      docker push hpccsystems/${name}:latest
     fi
+  else
+    if [ "$PUSH" = "1" ] ; then
+      docker push hpccsystems/${name}:${label}
+    fi
+  fi
+}
+
+# Build the Machine Learning images selected by BUILD_ML.
+# Globals:   BUILD_ML    (read) - empty/unset: build nothing; "all": build every
+#                                 entry of ml_features; otherwise a space- or
+#                                 comma-separated list of feature names
+#            ml_features (read) - array of known feature names
+# Outputs:   "build_ml <feature>" on stdout for each image built; a message on
+#            stderr for each unknown feature name, which is then skipped
+build_ml_image() {
+  [ -z "$BUILD_ML" ] && return
+  local -a features=()
+  local feature ml_feature found
+  if [ "$BUILD_ML" = "all" ] ; then
+    features=("${ml_features[@]}")
+  else
+    # Word splitting is intentional here: BUILD_ML is a list of names.
+    # Commas are turned into spaces so "gnn,gnn-gpu" works like "gnn gnn-gpu".
+    # shellcheck disable=SC2086
+    for feature in ${BUILD_ML//,/ } ; do
+      found=false
+      for ml_feature in "${ml_features[@]}" ; do
+        # The right-hand side is quoted so it is compared literally; unquoted
+        # it would be treated as a glob pattern (e.g. "g*" would match "gnn").
+        if [[ "$ml_feature" == "$feature" ]] ; then
+          features+=("${feature}")
+          found=true
+          break
+        fi
+      done
+      if [ "$found" = "false" ] ; then
+        printf "\nUnknown ML feature %s\n" "$feature" >&2
+      fi
+    done
+  fi
+
+  for feature in "${features[@]}" ; do
+    echo "build_ml $feature"
+    build_ml "$feature"
+  done
+
+}
+
+build_ml() {
+  local name=$1
+  local label=$2
+  [[ -z ${label} ]] && label=$BUILD_LABEL
+  docker image build -t hpccsystems/platform-${name}:${label} \
+     --build-arg DOCKER_REPO=hpccsystems \
+     --build-arg BUILD_LABEL=${label} \
+     ml/${name}/
 }
 
 build_image platform-build-base ${BASE_VER}
 build_image platform-build
 build_image platform-core
+build_ml_image
 
 if [[ -n ${INPUT_PASSWORD} ]] ; then
   echo "::set-output name=${BUILD_LABEL}"

+ 53 - 0
dockerfiles/ml/README.md

@@ -0,0 +1,53 @@
+# HPCC Systems Machine Learning Docker Images
+
+## Current Machine Learning Features
+
+- ml:      scikit-learn
+- gnn:     TensorFlow 2 + scikit-learn
+- gnn-gpu: TensorFlow 2 with GPU support + scikit-learn
+
+## GNN and Tensorflow
+
+HPCC Systems GNN currently officially supports TensorFlow version 1.x. However, the latest HPCC Systems Docker images are based on Ubuntu 20.04 and are linked against the Python 3.8 libraries, which do not support TensorFlow version 1.x. To use TensorFlow 2 with GNN, create the TensorFlow session as follows:
+
+```code
+import tensorflow as tf
+s = tf.compat.v1.Session()
+```
+
+## Tensorflow 2 with GPU support
+
+New TensorFlow 2 releases and the NVIDIA CUDA libraries are not always compatible. Ideally we would use the Docker image published by TensorFlow, but its base image is Ubuntu 18.04 and we use Ubuntu 20.04. So instead we create the images by adding TensorFlow 2, the NVIDIA CUDA libraries, etc. on top of the hpccsystems/platform-core image.
+
+When preparing the Dockerfile, especially for TensorFlow 2 with GPU support, it is important to reference
+the TensorFlow and NVIDIA CUDA Dockerfiles and Docker images in order to pick compatible libraries.
+
+### Docker image and Dockerfile Reference
+
+#### TensorFlow
+
+- Dockerfile:
+  https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
+- tensorflow/tensorflow:2.2.0-gpu
+   https://hub.docker.com/layers/tensorflow/tensorflow/2.2.0-gpu/images/sha256-3f8f06cdfbc09c54568f191bbc54419b348ecc08dc5e031a53c22c6bba0a252e?context=explore
+
+#### Nvidia Cuda
+
+- NVIDIA CUDA Dockerfile: https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/ubuntu18.04/10.1/base/Dockerfile
+- nvidia/cuda:10.1-base-ubuntu18.04:
+  https://hub.docker.com/layers/nvidia/cuda/10.1-base-ubuntu18.04/images/sha256-3cb86d1437161ef6998c4a681f2ca4150368946cc8e09c5e5178e3598110539f?context=explore
+
+## Build
+
+Make sure the desired hpccsystems/platform-core Docker image is available; otherwise it needs to be built first with the buildall.sh script.
+
+Go to the ml images directory, <HPCC Platform>/dockerfiles/ml:
+
+```console
+./build.sh -t <platform-core tag> -m <one of ml, gnn and gnn-gpu>
+```
+
+You can provide "-l" to also tag the image without version information, as the default (latest) image.
+You need to manually push the image to a Docker repository such as Docker Hub.
+
+Machine Learning features can also be built with buildall.sh when the environment variable BUILD_ML is defined. To build all of them, set BUILD_ML=all. To build an individual feature or a subset, set, for example, BUILD_ML=gnn or BUILD_ML="gnn gnn-gpu".

+ 87 - 0
dockerfiles/ml/build.sh

@@ -0,0 +1,87 @@
+#!/bin/bash
+##############################################################################
+#
+#    HPCC SYSTEMS software Copyright (C) 2020 HPCC Systems® .
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+##############################################################################
+
+# Use this script to build local images for Machine Learning HPCC Systems Docker images
+#
+
+usage()
+{
+  echo "Usage: build.sh [options]"
+  echo "    -h     Display help"
+  echo "    -l     Tag the images as the latest"
+  echo "    -m     ML feature: one of ml, gnn and gnn-gpu"
+  echo "    -t     Tag of base image hpccsystems/platform-core"
+  exit
+}
+LABEL=
+FEATURE=
+while getopts “hlm:t:” opt; do
+  case $opt in
+    l) TAGLATEST=1 ;;
+    m) FEATURE=$OPTARG ;;
+    t) LABEL=$OPTARG ;;
+    h) usage   ;;
+  esac
+done
+shift $(( $OPTIND-1 ))
+
+[[ -z ${FEATURE} ]] && usage
+
+ml_features=(
+  'ml'
+  'gnn'
+  'gnn-gpu'
+)
+
+found="false"
+for ml_feature in ${ml_features[@]}
+do
+  if [[ $ml_feature ==  $FEATURE ]]
+  then
+    found="true"
+    break
+  fi
+done
+
+if [[ "$found" == "false" ]]
+then
+	echo "Unknown ML feature $FEATURE"
+fi
+
+
+[[ -z ${LABEL} ]] && LABEL=latest
+
+
+
+build_image()
+{
+  name=$1
+  docker image build -t hpccsystems/platform-${name}:${LABEL} \
+     --build-arg DOCKER_REPO=hpccsystems \
+     --build-arg BUILD_LABEL=${LABEL} \
+     ${name}/
+
+  if [ "$TAGLATEST" = "1" ] && [ "${LABEL}" != "latest" ]; then
+     docker tag hpccsystems/platform-${name}:${LABEL}  hpccsystems/platform-${name}
+  fi
+
+}
+
+echo .
+echo "build_image $FEATURE"
+build_image $FEATURE

+ 128 - 0
dockerfiles/ml/gnn-gpu/Dockerfile

@@ -0,0 +1,128 @@
+##############################################################################
+#
+#    HPCC SYSTEMS software Copyright (C) 2020 HPCC Systems®.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+##############################################################################
+
+# Build the HPCC Systems GNN image with GPU support: hpccsystems/platform-core
+# plus the NVIDIA CUDA 10.1 libraries, scikit-learn and TensorFlow.
+# MORE - some of these dependencies are probably not needed by all derived containers - perhaps we should move them
+# Others may not be wanted at all in container mode - tensorflow and nvidia  example??
+
+# Tag of the hpccsystems/platform-core image to build on
+ARG BUILD_LABEL
+FROM hpccsystems/platform-core:${BUILD_LABEL}
+# Switch to root for the installation steps that follow
+USER root
+
+
+# nvidia/cuda base
+# Register the NVIDIA CUDA and machine-learning apt repositories (the
+# ubuntu1804 builds). curl is installed only to fetch the repository key and
+# is purged again in the same layer.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+gnupg2 curl ca-certificates && \
+    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \
+    echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
+    echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
+    apt-get purge --autoremove -y curl && \
+rm -rf /var/lib/apt/lists/*
+
+# CUDA release to install
+ENV CUDA_VERSION 10.1.243
+
+# Package-name suffix plus pinned version, appended to "cuda-cudart-" below
+ENV CUDA_PKG_VERSION 10-1=$CUDA_VERSION-1
+
+# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a
+# Installs the CUDA runtime and makes /usr/local/cuda a relative symlink to cuda-10.1
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        cuda-cudart-$CUDA_PKG_VERSION \
+cuda-compat-10-1 && \
+ln -s cuda-10.1 /usr/local/cuda && \
+    rm -rf /var/lib/apt/lists/*
+
+# Required for nvidia-docker v1
+RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
+    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
+
+# Install pip. "apt-get update" runs in the same layer as "apt-get install" so
+# that a cached layer with a stale package index is never reused, and apt-get
+# is used instead of apt because apt's command-line interface is meant for
+# interactive use. -y keeps the repair/autoremove steps non-interactive.
+RUN apt-get clean && \
+    apt-get autoclean && \
+    apt-get install -f -y && \
+    apt-get autoremove -y && \
+    apt-get update && \
+    apt-get install -y python3-pip --fix-missing && \
+    rm -rf /var/lib/apt/lists/*
+RUN python3 -m pip install --no-cache-dir --upgrade pip
+
+# Python Machine Learning dependencies (--no-cache-dir keeps the layer small)
+RUN pip3 install --no-cache-dir \
+    scikit-learn       \
+    statsmodels        \
+    networkx
+
+# TensorFlow with GPU support
+# Reference: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
+
+# Versions of the CUDA libraries installed below. Only CUDA and CUDNN are
+# referenced in this Dockerfile; CUDNN_MAJOR_VERSION, LIB_DIR_PREFIX,
+# LIBNVINFER and LIBNVINFER_MAJOR_VERSION are declared but not used here.
+ARG CUDA=10.1
+ARG CUDNN=7.6.4.38-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
+ARG LIBNVINFER=6.0.1-1
+ARG LIBNVINFER_MAJOR_VERSION=6
+
+# Needed for string substitution: ${CUDA/./-} below is a bash expansion that
+# turns "10.1" into "10-1" to form the CUDA package names
+SHELL ["/bin/bash", "-c"]
+# Pick up some TF dependencies
+# NOTE(review): unlike the earlier apt-get steps, this one does not remove
+# /var/lib/apt/lists afterwards - confirm whether that is intentional
+RUN apt-get update -y && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        # There appears to be a regression in libcublas10=10.2.2.89-1 which
+        # prevents cublas from initializing in TF. See
+        # https://github.com/tensorflow/tensorflow/issues/9489#issuecomment-562394257
+        libcublas10=10.2.1.243-1 \
+        cuda-nvrtc-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        software-properties-common \
+        unzip
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
+# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure
+# dynamic linker run-time bindings
+RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \
+    && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \
+    && ldconfig
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+# Upgrade setuptools before TensorFlow is installed
+RUN python3 -m pip --no-cache-dir install --upgrade \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python3) /usr/local/bin/python
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+# TF_PACKAGE_VERSION defaults to 2.2.0, the release the gnn image installs,
+# so that TensorFlow stays in step with the pinned CUDA 10.1 / cuDNN 7.6
+# libraries installed above. An unpinned install would pull whatever release
+# is newest at build time.
+# Set --build-arg TF_PACKAGE_VERSION=<version> to install a different version
+# (required when TF_PACKAGE is one of the nightly packages).
+ARG TF_PACKAGE=tensorflow
+ARG TF_PACKAGE_VERSION=2.2.0
+RUN python3 -m pip install --no-cache-dir ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
+
+# Installation is done; run as the hpcc user again
+USER hpcc

+ 37 - 0
dockerfiles/ml/gnn/Dockerfile

@@ -0,0 +1,37 @@
+##############################################################################
+#
+#    HPCC SYSTEMS software Copyright (C) 2020 HPCC Systems®.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+##############################################################################
+
+# Build the HPCC Systems GNN image: hpccsystems/platform-core plus
+# scikit-learn, statsmodels, networkx and TensorFlow 2. No CUDA libraries are
+# installed here; the gnn-gpu image adds GPU support.
+# MORE - some of these dependencies are probably not needed by all derived containers - perhaps we should move them
+# Others may not be wanted at all in container mode - tensorflow, etc  ??
+
+# Tag of the hpccsystems/platform-core image to build on
+ARG BUILD_LABEL
+FROM hpccsystems/platform-core:${BUILD_LABEL}
+# Switch to root for the installation steps that follow
+USER root
+
+# Refresh the package index in the same layer as the install so that a stale
+# or missing index in the base image cannot break the build, then remove it
+# again to keep the layer small.
+RUN apt-get update && \
+    apt-get install -y python3-pip --fix-missing && \
+    rm -rf /var/lib/apt/lists/*
+RUN python3 -m pip install --no-cache-dir --upgrade pip
+
+# Python Machine Learning dependencies (--no-cache-dir keeps the layer small)
+RUN pip3 install --no-cache-dir \
+    scikit-learn       \
+    statsmodels        \
+    networkx           \
+    setuptools==46.1.3 \
+    scipy==1.4.1       \
+    tensorflow==2.2.0
+
+# Installation is done; run as the hpcc user again
+USER hpcc

+ 32 - 0
dockerfiles/ml/ml/Dockerfile

@@ -0,0 +1,32 @@
+##############################################################################
+#
+#    HPCC SYSTEMS software Copyright (C) 2020 HPCC Systems®.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+##############################################################################
+
+# Build the HPCC Systems ML image: hpccsystems/platform-core plus scikit-learn.
+# MORE - some of these dependencies are probably not needed by all derived containers - perhaps we should move them
+# Others may not be wanted at all in container mode - scikit example??
+
+# Tag of the hpccsystems/platform-core image to build on
+ARG BUILD_LABEL
+FROM hpccsystems/platform-core:${BUILD_LABEL}
+# Switch to root for the installation steps that follow
+USER root
+
+# Machine Learning Dependencies
+# Refresh the package index in the same layer as the install so that a stale
+# or missing index in the base image cannot break the build, then remove it
+# again to keep the layer small.
+RUN apt-get update && \
+    apt-get install -y python3 python3-pip --fix-missing && \
+    rm -rf /var/lib/apt/lists/*
+RUN python3 -m pip install --no-cache-dir --upgrade pip
+RUN pip3 install --no-cache-dir \
+    scikit-learn
+
+# Installation is done; run as the hpcc user again
+USER hpcc