# sciml-bench-cu11-mlnx.def

  # Definition header: build on top of an existing local SIF image.
  # The path is relative, so ompi4-cu11.sif must be resolvable from the directory
  # the build is started in.
  # NOTE(review): presumably this base image provides OpenMPI 4 and CUDA 11 - confirm.
  Bootstrap: localimage
  From: ompi4-cu11.sif
  %files
  # -----------------------------------------------------------------------------------
  # Copy the sciml-bench sources from the build host into the image.
  # Each line is "<path on the host> <path inside the container>"; host paths are
  # relative, so the build has to be started from the directory containing sciml-bench/.
  sciml-bench/requirements.txt /sciml-benchmarks/requirements.txt
  sciml-bench/MANIFEST.in /sciml-benchmarks/MANIFEST.in
  sciml-bench/setup.py /sciml-benchmarks/setup.py
  sciml-bench/doc /sciml-benchmarks/doc
  sciml-bench/sciml_bench /sciml-benchmarks/sciml_bench
  %environment
  # -----------------------------------------------------------------------------------
  # Variables exported into the container's runtime environment.
  #
  # Prepend the CUDA libraries to the loader path. The ':' separator is only added
  # when LD_LIBRARY_PATH already has a value: an empty path component (trailing ':')
  # would make the dynamic loader search the current working directory as well.
  export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
  export LC_ALL=C
  # Horovod collective back-ends: NCCL for allreduce, MPI for allgather and broadcast.
  export HOROVOD_GPU_ALLREDUCE=NCCL
  export HOROVOD_GPU_ALLGATHER=MPI
  export HOROVOD_GPU_BROADCAST=MPI
  # Explicit NCCL locations, kept for reference (currently disabled).
  #export HOROVOD_NCCL_HOME=/usr/local/cuda/nccl
  #export HOROVOD_NCCL_INCLUDE=/usr/local/cuda/nccl/include
  #export HOROVOD_NCCL_LIB=/usr/local/cuda/nccl/lib
  # Versions of the software stack installed by %post; keep in sync with that section.
  export PYTHON_VERSION=3.8
  export TENSORFLOW_VERSION=2.7.0
  export PYTORCH_VERSION=1.10.0+cu113
  %post
  # -----------------------------------------------------------------------------------
  # Build-time provisioning: installs all necessary packages and prepares the container.
  # The TensorFlow version is tightly coupled to CUDA and cuDNN, so select it carefully.
  #
  # Abort on the first failing command. Commands are deliberately written as separate
  # statements instead of '&&' chains: under 'set -e' a failure of a non-final command
  # in an '&&' chain does not stop the script, which would hide broken installs.
  set -e

  # The runtime %environment is not applied while %post runs, so the versions are
  # repeated here; keep both sections in sync.
  export PYTHON_VERSION=3.8
  export TENSORFLOW_VERSION=2.7.0
  export PYTORCH_VERSION=1.10.0+cu113

  # Resolve CUDA through the /usr/local/cuda root (the same root %environment uses for
  # LD_LIBRARY_PATH) rather than through mismatching hard-coded cuda-11.x directories.
  cuda_root=/usr/local/cuda

  # Never let a package's configuration step wait for interactive input during the build.
  export DEBIAN_FRONTEND=noninteractive

  # NVIDIA machine-learning apt repository (Ubuntu 18.04 flavour).
  # NOTE(review): assumes the base image is Ubuntu 18.04 and can install
  # python${PYTHON_VERSION} from its configured apt sources - confirm.
  echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
  apt-get -y update
  apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
      build-essential \
      cmake \
      git \
      curl \
      vim \
      wget \
      ca-certificates \
      libjpeg-dev \
      libpng-dev \
      "python${PYTHON_VERSION}" \
      "python${PYTHON_VERSION}-dev"

  # Make the selected interpreter the default "python" and bootstrap pip for it.
  ln -sf "/usr/bin/python${PYTHON_VERSION}" /usr/bin/python
  # -f: fail on HTTP errors instead of saving an error page that python would then execute.
  curl -fsSL -O https://bootstrap.pypa.io/get-pip.py
  python get-pip.py
  rm get-pip.py

  # Install TensorFlow, Keras and PyTorch
  pip install "torch==${PYTORCH_VERSION}" torchvision==0.11.1+cu113 torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
  pip install mxnet-cu112 "tensorflow-gpu==${TENSORFLOW_VERSION}" keras h5py filelock matplotlib scikit-learn

  export PATH="${cuda_root}/bin:$PATH"

  # Install Horovod, temporarily using CUDA stubs: the stub directory is registered with
  # the loader for the duration of the build and removed again by the plain ldconfig.
  ldconfig "${cuda_root}/targets/x86_64-linux/lib/stubs"
  HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod
  ldconfig

  # Set default NCCL parameters
  echo NCCL_DEBUG=INFO >> /etc/nccl.conf
  echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf

  # Download the Horovod examples into /examples.
  # GitHub no longer serves repositories over Subversion, so the former
  # "svn checkout .../trunk/examples" is replaced by a shallow git clone (git is
  # installed above) from which only the examples directory is kept.
  git clone --depth 1 https://github.com/horovod/horovod.git /tmp/horovod-src
  mkdir -p /examples
  cp -r /tmp/horovod-src/examples/. /examples/
  rm -rf /tmp/horovod-src

  # Install sciml-bench from the sources copied in by %files
  cd /sciml-benchmarks
  pip install .
  %runscript
  # Entry point of the container: hand every argument over to the sciml-bench CLI.
  # "$@" is quoted so that arguments containing spaces or glob characters reach
  # sciml-bench unchanged instead of being re-split by the shell.
  sciml-bench "$@"