# sciml-bench-cu11-tf27-mlnx.def

  # Header: build on top of an existing local SIF image. The path is relative,
  # so the base image is presumably expected in the directory the build is run
  # from -- TODO confirm against the build instructions.
  Bootstrap: localimage
  From: ompi4-cu11-mlnx.sif
  %files
  # -----------------------------------------------------------------------------------
  # Copy the sciml-bench sources from the build host into the image.
  # Each entry is "<path on host> <path in container>"; the copied tree under
  # /sciml-benchmarks is what the %post section pip-installs.
  requirements.txt  /sciml-benchmarks/requirements.txt
  MANIFEST.in       /sciml-benchmarks/MANIFEST.in
  setup.py          /sciml-benchmarks/setup.py
  doc               /sciml-benchmarks/doc
  sciml_bench       /sciml-benchmarks/sciml_bench
  %environment
  # -----------------------------------------------------------------------------------
  # Variables exported into every process started inside the container.
  #
  # Prepend the CUDA libraries to the loader path. The ${VAR:+...} form adds the
  # ':' separator only when LD_LIBRARY_PATH already has a value; a trailing ':'
  # would make the dynamic loader search the current working directory.
  export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
  export LC_ALL=C
  # Horovod collective back-ends: NCCL for allreduce, MPI for allgather/broadcast.
  export HOROVOD_GPU_ALLREDUCE=NCCL
  export HOROVOD_GPU_ALLGATHER=MPI
  export HOROVOD_GPU_BROADCAST=MPI
  #export HOROVOD_NCCL_HOME=/usr/local/cuda/nccl
  #export HOROVOD_NCCL_INCLUDE=/usr/local/cuda/nccl/include
  #export HOROVOD_NCCL_LIB=/usr/local/cuda/nccl/lib
  # Versions installed by %post; keep both sections in sync when changing them.
  export PYTHON_VERSION=3.8
  export TENSORFLOW_VERSION=2.7.0
  export PYTORCH_VERSION=1.10.0+cu113
  %post
  # -----------------------------------------------------------------------------------
  # Installs all necessary packages and prepares the container.
  # The TensorFlow version is tightly coupled to CUDA and cuDNN, so it must be
  # selected carefully. These values mirror the ones exported in %environment
  # (which is not sourced during the build) -- keep both in sync.
  export PYTHON_VERSION=3.8
  export TENSORFLOW_VERSION=2.7.0
  export PYTORCH_VERSION=1.10.0+cu113

  # Unattended build: stop apt/dpkg from waiting on interactive prompts.
  export DEBIAN_FRONTEND=noninteractive

  # NVIDIA machine-learning apt repository (Ubuntu 18.04, x86_64).
  # NOTE(review): assumes the base image is Ubuntu 18.04 and already trusts this
  # repository's signing key and HTTPS certificates -- confirm.
  echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" \
    > /etc/apt/sources.list.d/nvidia-ml.list

  # python${PYTHON_VERSION} must be installable from the configured apt sources.
  apt-get -y update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
    build-essential \
    cmake \
    git \
    curl \
    vim \
    wget \
    ca-certificates \
    libjpeg-dev \
    libpng-dev \
    "python${PYTHON_VERSION}" \
    "python${PYTHON_VERSION}-dev"

  # Make the selected interpreter the default `python`.
  ln -sf "/usr/bin/python${PYTHON_VERSION}" /usr/bin/python

  # Bootstrap pip. Interpreters that the current pip no longer supports need the
  # version-specific get-pip.py; fall back to the generic script otherwise.
  # -f makes curl fail on HTTP errors instead of saving the error page.
  { curl -fsSLO "https://bootstrap.pypa.io/pip/${PYTHON_VERSION}/get-pip.py" || \
    curl -fsSLO https://bootstrap.pypa.io/get-pip.py; } && \
    python get-pip.py && \
    rm -f get-pip.py
  # Install PyTorch, MXNet, TensorFlow, Keras and supporting libraries.
  # The torchvision/torchaudio pins must match PYTORCH_VERSION (cu113 wheels).
  # --no-cache-dir keeps pip's download cache out of the image.
  pip install --no-cache-dir "torch==${PYTORCH_VERSION}" torchvision==0.11.1+cu113 torchaudio==0.10.0+cu113 \
    -f https://download.pytorch.org/whl/cu113/torch_stable.html
  pip install --no-cache-dir mxnet-cu112 "tensorflow-gpu==${TENSORFLOW_VERSION}" keras h5py filelock matplotlib scikit-learn

  # Use the same CUDA root for nvcc and for the driver stubs. The
  # /usr/local/cuda path is the one %environment already relies on, so the
  # build does not depend on a particular cuda-11.x directory name.
  cuda_root=/usr/local/cuda
  ls /usr/local/                      # build-log aid: shows the installed CUDA directories
  export PATH="${cuda_root}/bin:${PATH}"
  command -v nvcc || echo "warning: nvcc not found on PATH" >&2

  # Install Horovod, temporarily using CUDA stubs: the stub libraries let the
  # build link without a GPU driver; the final ldconfig drops them again.
  ldconfig "${cuda_root}/targets/x86_64-linux/lib/stubs" && \
    HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 \
      pip install --no-cache-dir horovod && \
    ldconfig
  # Set default NCCL parameters: append to /etc/nccl.conf so that NCCL logs at
  # INFO level and skips the docker0 interface ('^' means "exclude").
  printf '%s\n' \
    'NCCL_DEBUG=INFO' \
    'NCCL_SOCKET_IFNAME=^docker0' >> /etc/nccl.conf
  # Download the Horovod examples into /examples.
  # GitHub no longer serves repositories over Subversion, so fetch them with a
  # shallow git clone (git is installed above) and keep only the examples tree.
  cd / && \
    horovod_src="$(mktemp -d)" && \
    git clone --depth 1 https://github.com/horovod/horovod.git "${horovod_src}" && \
    rm -rf /examples && \
    mv "${horovod_src}/examples" /examples && \
    rm -rf "${horovod_src}"
  # Install sciml-bench from the sources copied into /sciml-benchmarks.
  # Run from inside the source tree in case setup.py reads files relative to it.
  cd /sciml-benchmarks && pip install --no-cache-dir .
  %runscript
  # Entry point for running the container: forward every argument, unchanged,
  # to the sciml-bench CLI. "$@" keeps arguments containing spaces or glob
  # characters intact; exec replaces the wrapper shell so signals and the exit
  # status come straight from sciml-bench.
  exec sciml-bench "$@"