Ver código fonte

Add config files to allow switching between platforms or configurations

Luke Hare 3 anos atrás
pai
commit
ff0474f53f

+ 10 - 0
workflows/sciml-bench/singularity_config_files/cu10_config.sh

@@ -0,0 +1,10 @@
+export OMPI4_CONTAINER=ompi4-config.sif  # image name given to `singularity build` by ompi-config-builder.sh and substituted into sciml-config-header.txt
+export CUDA_BASE_IMAGE="nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04"  # substituted into the `From:` line of ompi-header.txt
+export SINGULARITYENV_VERBS=without-verbs  # presumably visible as VERBS during the build, where it forms the `--${VERBS}` configure switch -- TODO confirm
+
+export SINGULARITYENV_TENSORFLOW_VERSION=2.3.0  # interpolated into PIP_CMD_2 below
+export SINGULARITYENV_PYTORCH_VERSION=1.7.1+cu101  # interpolated into PIP_CMD_1 below
+export SINGULARITYENV_PIP_CMD_1="pip install torch==${SINGULARITYENV_PYTORCH_VERSION} torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html"  # double quotes: the version is expanded when this file is sourced
+export SINGULARITYENV_PIP_CMD_2="pip install mxnet-cu101 tensorflow-gpu==${SINGULARITYENV_TENSORFLOW_VERSION} keras h5py filelock matplotlib scikit-learn"  # double quotes: the version is expanded when this file is sourced
+export SINGULARITYENV_LDCONFIG_PATH="/usr/local/cuda-10.1/targets/x86_64-linux/lib/stubs"  # CUDA 10.1 stubs directory; presumably the LDCONFIG_PATH handed to ldconfig in sciml-config-body.txt -- TODO confirm
+export SINGULARITYENV_PIP_CMD_3="HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod"  # the leading NAME=value words are part of the stored string

+ 10 - 0
workflows/sciml-bench/singularity_config_files/cu11_config.sh

@@ -0,0 +1,10 @@
+export OMPI4_CONTAINER=ompi4-config.sif  # image name given to `singularity build` by ompi-config-builder.sh and substituted into sciml-config-header.txt
+export CUDA_BASE_IMAGE="nvidia/cuda:11.4.2-cudnn8-devel-ubuntu18.04"  # substituted into the `From:` line of ompi-header.txt
+export SINGULARITYENV_VERBS=without-verbs  # presumably visible as VERBS during the build, where it forms the `--${VERBS}` configure switch -- TODO confirm
+
+export SINGULARITYENV_TENSORFLOW_VERSION=2.7.0  # interpolated into PIP_CMD_2 below
+export SINGULARITYENV_PYTORCH_VERSION=1.10.0+cu113  # interpolated into PIP_CMD_1 below
+export SINGULARITYENV_PIP_CMD_1="pip install torch==${SINGULARITYENV_PYTORCH_VERSION} torchvision==0.11.1+cu113 torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html"  # double quotes: the version is expanded when this file is sourced
+export SINGULARITYENV_PIP_CMD_2="pip install mxnet-cu112 tensorflow-gpu==${SINGULARITYENV_TENSORFLOW_VERSION} keras h5py filelock matplotlib scikit-learn"  # double quotes: the version is expanded when this file is sourced
+export SINGULARITYENV_LDCONFIG_PATH="/usr/local/cuda-11.4/targets/x86_64-linux/lib/stubs"  # CUDA 11.4 stubs directory; presumably the LDCONFIG_PATH handed to ldconfig in sciml-config-body.txt -- TODO confirm
+export SINGULARITYENV_PIP_CMD_3="HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod "  # the leading NAME=value words are part of the stored string

+ 99 - 0
workflows/sciml-bench/singularity_config_files/ompi-body.txt

@@ -0,0 +1,99 @@
+Stage: spython-base
+
+%post
+
+export SINGULARITY_BINDPATH=$SINGULARITY_BINDPATH,$(echo /usr/bin/ | sed -e 's/ /,/g')  # appends /usr/bin/ to the comma-separated bind list; the sed would turn spaces into commas if several paths were echoed
+
+MLNX_ARCH=ubuntu18.04-x86_64  # platform part of the MLNX_OFED_LINUX archive name used further down
+MLNX_VERSION=5.2-1.0.4.0  # release part of the MLNX_OFED download URL and archive name
+
+# Python 3.7 is supported by Ubuntu Bionic out of the box
+python=3.7
+PYTHON_VERSION=${python}  # used for the python${PYTHON_VERSION}* apt packages and the /usr/bin/python symlink
+
+# Set default shell to /bin/bash
+#SHELL ["/bin/bash", "-cu"]
+
+# Set the container time zone, then install the build toolchain and system
+# packages ahead of the MLNX OFED and Open MPI steps.
+TZ="Europe/London"
+
+# %post is plain shell: the Dockerfile keyword `RUN` that used to prefix this
+# line is not a command, so the line must start with `ln` itself.
+ln -snf "/usr/share/zoneinfo/$TZ" /etc/localtime && echo "$TZ" > /etc/timezone
+apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
+build-essential \
+cmake \
+g++-7 \
+git \
+curl \
+vim \
+wget \
+ca-certificates \
+python${PYTHON_VERSION} \
+python${PYTHON_VERSION}-dev \
+python${PYTHON_VERSION}-distutils \
+libpython-stdlib \
+python \
+udev \
+automake \
+gfortran \
+autotools-dev \
+chrpath \
+pkg-config \
+libnl-3-dev \
+tcl \
+pciutils \
+tk \
+libnl-route-3-dev \
+libltdl-dev \
+bison \
+autoconf \
+flex \
+graphviz \
+libssl1.0.0 \
+kmod \
+debhelper \
+libgfortran4 \
+ethtool \
+swig \
+lsof \
+libnl-route-3-200 \
+m4 \
+libnl-3-200 \
+dpatch \
+libnuma-dev \
+iputils-ping \
+iproute2 \
+apt-utils
+
+ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python
+
+curl -O https://bootstrap.pypa.io/get-pip.py && \
+python get-pip.py && \
+rm get-pip.py
+
+wget http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_VERSION}/MLNX_OFED_LINUX-${MLNX_VERSION}-${MLNX_ARCH}.tgz && \
+tar -xzvf MLNX_OFED_LINUX-${MLNX_VERSION}-${MLNX_ARCH}.tgz --no-same-owner && \
+cd  MLNX_OFED_LINUX-${MLNX_VERSION}-${MLNX_ARCH} && \
+./mlnxofedinstall --user-space-only --without-fw-update --skip-repo --force && \
+cd .. && \
+rm -rf MLNX_OFED-LINUX-${MLNX_VERSION}-${MLNX_ARCH} && \
+rm -rf *.tgz
+
+# Install Open MPI version 4.1
+wget https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-4.1.0.tar.gz && \
+tar zxf openmpi-4.1.0.tar.gz && \
+cd openmpi-4.1.0 && \
+./configure --enable-mpirun-prefix-by-default \
+            --enable-shared \
+            --${VERBS} && \
+make -j $(nproc) all && \
+make install && \
+ldconfig && \
+rm -rf /tmp/openmpi
+
+cd /
+pip --no-cache-dir --disable-pip-version-check install mpi4py  # installed after the Open MPI build above; --no-cache-dir keeps pip's download cache out of the image
+
+LD_LIBRARY_PATH=/usr/local/lib/:/usr/local/lib/openmpi:$LD_LIBRARY_PATH  # NOTE(review): unexported assignment as the last %post statement, nothing after it reads the value -- confirm whether this was meant for %environment
+
+%environment
+
+%runscript

+ 3 - 0
workflows/sciml-bench/singularity_config_files/ompi-config-builder.sh

@@ -0,0 +1,3 @@
+{ (envsubst < ompi-header.txt) & (cat ompi-body.txt); } > tmp_file.def 2>/dev/null
+sudo -E singularity build ${OMPI4_CONTAINER} tmp_file.def
+rm tmp_file.def

+ 2 - 0
workflows/sciml-bench/singularity_config_files/ompi-header.txt

@@ -0,0 +1,2 @@
+Bootstrap: docker
+From: ${CUDA_BASE_IMAGE}

+ 85 - 0
workflows/sciml-bench/singularity_config_files/sciml-config-body.txt

@@ -0,0 +1,85 @@
+
+%files
+# -----------------------------------------------------------------------------------
+    ../sciml-bench/requirements.txt /sciml-benchmarks/requirements.txt
+    ../sciml-bench/MANIFEST.in /sciml-benchmarks/MANIFEST.in
+    ../sciml-bench/setup.py /sciml-benchmarks/setup.py
+    ../sciml-bench/doc /sciml-benchmarks/doc
+    ../sciml-bench/sciml_bench /sciml-benchmarks/sciml_bench
+
+
+%environment
+# -----------------------------------------------------------------------------------
+
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH  # put the CUDA library directory ahead of any inherited search path
+    export LC_ALL=C  # force the C locale
+    export HOROVOD_GPU_ALLREDUCE=NCCL  # same value the config scripts put in PIP_CMD_3
+    export HOROVOD_GPU_ALLGATHER=MPI
+    export HOROVOD_GPU_BROADCAST=MPI
+    #export HOROVOD_NCCL_HOME=/usr/local/cuda/nccl
+    #export HOROVOD_NCCL_INCLUDE=/usr/local/cuda/nccl/include
+    #export HOROVOD_NCCL_LIB=/usr/local/cuda/nccl/lib
+    export PYTHON_VERSION=3.8  # same value %post exports before installing python${PYTHON_VERSION}
+    #export TENSORFLOW_VERSION=${TENSORFLOW_VERSION}
+    #export PYTORCH_VERSION=${PYTORCH_VERSION}
+
+%post
+# -----------------------------------------------------------------------------------
+# this will install all necessary packages and prepare the container
+
+# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
+# Python 2.7 or 3.5 is supported by Ubuntu Xenial out of the box
+
+    export PYTHON_VERSION=3.8
+    export TENSORFLOW_VERSION=${TENSORFLOW_VERSION}
+    export PYTORCH_VERSION=${PYTORCH_VERSION}
+
+    echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
+
+    apt-get -y update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
+        build-essential \
+        cmake \
+        git \
+        curl \
+        vim \
+        wget \
+        ca-certificates \
+        libjpeg-dev \
+        libpng-dev \
+        python${PYTHON_VERSION} \
+        python${PYTHON_VERSION}-dev
+
+    ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python
+
+    curl -O https://bootstrap.pypa.io/get-pip.py && \
+    python get-pip.py && \
+    rm get-pip.py
+
+# Install TensorFlow, Keras and PyTorch
+# PIP_CMD_1 and PIP_CMD_2 each hold a complete command line; they are left
+# unquoted on purpose so the shell splits them into command and arguments.
+
+    ${PIP_CMD_1}
+    ${PIP_CMD_2}
+
+    # Use the version-independent /usr/local/cuda path (as %environment does for
+    # lib64) instead of a hard-coded cuda-11.5 directory, which matches neither
+    # the CUDA 10.1 nor the CUDA 11.4 base image.
+    export PATH="/usr/local/cuda/bin:$PATH"
+
+
+# Install Horovod, temporarily using CUDA stubs
+# PIP_CMD_3 starts with NAME=value words. The shell only recognises assignments
+# before expansion, so expanding the variable directly would try to execute
+# "HOROVOD_GPU_ALLREDUCE=NCCL" as a command; `env` applies those settings and
+# then runs the pip command that follows them.
+    ldconfig ${LDCONFIG_PATH} && \
+    env ${PIP_CMD_3} && \
+    ldconfig
+
+# Set default NCCL parameters
+    echo NCCL_DEBUG=INFO >> /etc/nccl.conf && \
+    echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf
+
+# Download examples
+    cd / && \
+    apt-get install -y --no-install-recommends subversion && \
+    svn checkout https://github.com/uber/horovod/trunk/examples && \
+    rm -rf /examples/.svn
+
+# Install sciml-bench
+    cd /sciml-benchmarks && pip install .
+
+%runscript
+    sciml-bench $@

+ 3 - 0
workflows/sciml-bench/singularity_config_files/sciml-config-builder.sh

@@ -0,0 +1,3 @@
+{ (envsubst < sciml-config-header.txt) & (cat sciml-config-body.txt); } > tmp_file.def 2>/dev/null
+sudo -E singularity build sciml-bench-with-config.sif tmp_file.def
+rm tmp_file.def

+ 2 - 0
workflows/sciml-bench/singularity_config_files/sciml-config-header.txt

@@ -0,0 +1,2 @@
+Bootstrap: localimage
+From: ${OMPI4_CONTAINER}