1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889 |
- # Container recipe for sciml-bench, layered on top of an existing local SIF image.
- # NOTE(review): the image name suggests OpenMPI 4, CUDA 11 and Mellanox drivers -- confirm its
- # contents; the file must be reachable under this path when the build is started.
- Bootstrap: localimage
- From: ompi4-cu11-mlnx.sif
- %files
- # -----------------------------------------------------------------------------------
- # Copies the sciml-bench sources from the build host into the image.
- # Each entry is "<source on host> <destination in container>"; everything lands under
- # /sciml-benchmarks, which is the directory %post later installs with `pip install .`.
- requirements.txt /sciml-benchmarks/requirements.txt
- MANIFEST.in /sciml-benchmarks/MANIFEST.in
- setup.py /sciml-benchmarks/setup.py
- doc /sciml-benchmarks/doc
- sciml_bench /sciml-benchmarks/sciml_bench
- %environment
- # -----------------------------------------------------------------------------------
- # Runtime environment, applied whenever the container is started.
- #
- # Prepend the CUDA libraries to the loader path. The ${VAR:+:...} form only adds the
- # separator when LD_LIBRARY_PATH already has a value; a plain "...:$LD_LIBRARY_PATH"
- # would leave a trailing ':' when it is unset, and an empty entry makes the dynamic
- # loader search the current working directory.
- export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
- export LC_ALL=C
- # Horovod collective back-ends: allreduce through NCCL, allgather and broadcast through MPI.
- # NOTE(review): Horovod documents these as build-time switches -- confirm they are needed at runtime.
- export HOROVOD_GPU_ALLREDUCE=NCCL
- export HOROVOD_GPU_ALLGATHER=MPI
- export HOROVOD_GPU_BROADCAST=MPI
- # Optional NCCL location overrides, currently disabled.
- #export HOROVOD_NCCL_HOME=/usr/local/cuda/nccl
- #export HOROVOD_NCCL_INCLUDE=/usr/local/cuda/nccl/include
- #export HOROVOD_NCCL_LIB=/usr/local/cuda/nccl/lib
- # Versions of the installed toolchain; keep in sync with the copies exported at the top of %post.
- export PYTHON_VERSION=3.8
- export TENSORFLOW_VERSION=2.7.0
- export PYTORCH_VERSION=1.10.0+cu113
- %post
- # -----------------------------------------------------------------------------------
- # Build-time provisioning: installs the system packages, the Python toolchain, the
- # deep-learning frameworks, Horovod and finally sciml-bench itself.
- # Pick the TensorFlow release with care: each one is tied to specific CUDA and cuDNN versions.
- # NOTE(review): this section used to state that Ubuntu Xenial ships Python 2.7/3.5, but the apt
- # source configured further down targets ubuntu1804 -- confirm the base image release.
- # The values below mirror the ones exported in %environment; keep both copies in sync.
- PYTHON_VERSION=3.8
- TENSORFLOW_VERSION=2.7.0
- PYTORCH_VERSION=1.10.0+cu113
- export PYTHON_VERSION TENSORFLOW_VERSION PYTORCH_VERSION
- # Register NVIDIA's machine-learning apt repository (Ubuntu 18.04, x86_64).
- echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
- # Build tools, download utilities, image libraries and the selected Python interpreter.
- # DEBIAN_FRONTEND=noninteractive stops package configuration from prompting during the
- # unattended build.
- apt-get -y update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
-     build-essential \
-     cmake \
-     git \
-     curl \
-     vim \
-     wget \
-     ca-certificates \
-     libjpeg-dev \
-     libpng-dev \
-     python${PYTHON_VERSION} \
-     python${PYTHON_VERSION}-dev
- # Make the selected interpreter the default `python`.
- ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python
- # Bootstrap pip. The unversioned get-pip.py only supports currently maintained Python
- # releases and refuses to run on older ones such as 3.8, so try the version-specific
- # script first and fall back to the generic one (version-specific scripts are only
- # published for end-of-life interpreters). -f makes curl fail on HTTP errors instead
- # of saving the error page as get-pip.py; -L follows redirects.
- { curl -fsSL -o get-pip.py "https://bootstrap.pypa.io/pip/${PYTHON_VERSION}/get-pip.py" || \
-     curl -fsSL -o get-pip.py https://bootstrap.pypa.io/get-pip.py; } && \
-     python get-pip.py && \
-     rm get-pip.py
- # Deep-learning frameworks.
- # PyTorch, torchvision and torchaudio come from the cu113 wheel index given with -f.
- pip install \
-     torch==${PYTORCH_VERSION} \
-     torchvision==0.11.1+cu113 \
-     torchaudio==0.10.0+cu113 \
-     -f https://download.pytorch.org/whl/cu113/torch_stable.html
- # MXNet, TensorFlow, Keras and the supporting Python packages.
- pip install \
-     mxnet-cu112 \
-     tensorflow-gpu==${TENSORFLOW_VERSION} \
-     keras \
-     h5py \
-     filelock \
-     matplotlib \
-     scikit-learn
-
- # Diagnostics for the build log: show what the base image provides under /usr/local
- # and where nvcc is found before PATH is extended.
- ls /usr/local/
- whereis nvcc
- # Resolve the CUDA toolkit through one root for both PATH and the stub libraries.
- # Previously PATH pointed at cuda-11.5 while the stubs came from cuda-11.4, so at most
- # one of the two could match the toolkit actually installed. /usr/local/cuda is the
- # path %environment already relies on; an existing CUDA_HOME takes precedence.
- # NOTE(review): assumes the base image provides the /usr/local/cuda symlink -- confirm.
- cuda_root="${CUDA_HOME:-/usr/local/cuda}"
- export PATH="${cuda_root}/bin:$PATH"
- whereis nvcc
- # Install Horovod, temporarily using CUDA stubs: the stub directory is registered with
- # the loader for the duration of the build and dropped again by the final ldconfig.
- ldconfig "${cuda_root}/targets/x86_64-linux/lib/stubs" && \
-     HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod && \
-     ldconfig
- # Default NCCL settings: append one KEY=VALUE line per setting to /etc/nccl.conf.
- printf '%s\n' 'NCCL_DEBUG=INFO' 'NCCL_SOCKET_IFNAME=^docker0' >> /etc/nccl.conf
- # Download the Horovod examples into /examples.
- # GitHub no longer serves repositories over Subversion, so the former
- # `svn checkout .../trunk/examples` cannot work any more. Take a shallow clone with git
- # (already installed above), keep only the examples directory and discard the rest.
- # /examples must not exist beforehand, otherwise cp would nest the directory inside it.
- cd / && \
-     git clone --depth 1 https://github.com/horovod/horovod.git /tmp/horovod-src && \
-     cp -r /tmp/horovod-src/examples /examples && \
-     rm -rf /tmp/horovod-src
- # Install sciml-bench from the sources that %files copied to /sciml-benchmarks.
- cd /sciml-benchmarks && \
-     pip install .
- %runscript
- # Default action of the container: hand every command-line argument to the sciml-bench CLI.
- # "$@" is quoted so arguments containing spaces or glob characters arrive unchanged
- # (a bare $@ would be re-split and expanded by the shell); exec replaces the wrapper
- # shell so signals and the exit status reach the caller directly.
- exec sciml-bench "$@"
|