sciml-bench-one-container.def 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  Bootstrap: docker
  From: ubuntu:18.04
  # The image is bootstrapped from the Docker ubuntu:18.04 base image.
  # NOTE(review): the commented-out stage name below looks like a leftover from
  # a Dockerfile conversion -- confirm before removing.
  #Stage: spython-base
  %files
  # -----------------------------------------------------------------------------------
  # Build-host sources copied into the image, one "<source> <destination>" pair
  # per line. Everything lands under /sciml-benchmarks, which %post later
  # installs with pip.
  sciml-bench/requirements.txt   /sciml-benchmarks/requirements.txt
  sciml-bench/MANIFEST.in        /sciml-benchmarks/MANIFEST.in
  sciml-bench/setup.py           /sciml-benchmarks/setup.py
  sciml-bench/doc                /sciml-benchmarks/doc
  sciml-bench/sciml_bench        /sciml-benchmarks/sciml_bench
  %environment
  # -----------------------------------------------------------------------------------
  # Variables exported at container run time.
  #
  # Put the CUDA libraries first on the loader path. The inherited value is
  # appended only when it is non-empty: a bare trailing ':' would add an empty
  # entry, which the dynamic loader treats as the current directory.
  export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
  export LC_ALL=C
  # Horovod collective back-ends: NCCL for allreduce, MPI for allgather/broadcast.
  export HOROVOD_GPU_ALLREDUCE=NCCL
  export HOROVOD_GPU_ALLGATHER=MPI
  export HOROVOD_GPU_BROADCAST=MPI
  # Versions advertised to processes in the container; these are the same
  # values the %post section exports before installing the frameworks.
  export PYTHON_VERSION=3.8
  export TENSORFLOW_VERSION=2.3.0
  export PYTORCH_VERSION=1.10.0+cu113
  %post
  # -----------------------------------------------------------------------------------
  # Build-time script: sets the versions used by the install steps that follow
  # and configures the container time zone.
  export SINGULARITY_BINDPATH="${SINGULARITY_BINDPATH},$(echo /usr/bin/ | sed -e 's/ /,/g')"
  # Mellanox OFED release and target platform; used by the OFED download step.
  MLNX_ARCH=ubuntu18.04-x86_64
  MLNX_VERSION=5.2-1.0.4.0
  # Python 3.7 is supported by Ubuntu Bionic out of the box
  # (PYTHON_VERSION is re-exported as 3.8 further down, before the ML
  # frameworks are installed).
  python=3.7
  PYTHON_VERSION=${python}
  # Dockerfile leftover, kept for reference only:
  #SHELL ["/bin/bash", "-cu"]
  TZ="Europe/London"
  # Point /etc/localtime at the chosen zone and record its name. The original
  # line was prefixed with the Dockerfile keyword RUN, which is not a shell
  # command, so the time zone was never configured.
  ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime && echo "${TZ}" > /etc/timezone
  # Base toolchain, the selected Python interpreter and the packages listed
  # below (package list kept in alphabetical order).
  apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install \
    --yes --allow-downgrades --allow-change-held-packages --no-install-recommends \
    apt-utils \
    autoconf \
    automake \
    autotools-dev \
    bison \
    build-essential \
    ca-certificates \
    chrpath \
    cmake \
    curl \
    debhelper \
    dpatch \
    ethtool \
    flex \
    g++-7 \
    gfortran \
    git \
    graphviz \
    iproute2 \
    iputils-ping \
    kmod \
    libgfortran4 \
    libltdl-dev \
    libnl-3-200 \
    libnl-3-dev \
    libnl-route-3-200 \
    libnl-route-3-dev \
    libnuma-dev \
    libpython-stdlib \
    libssl1.0.0 \
    lsof \
    m4 \
    pciutils \
    pkg-config \
    python \
    "python${PYTHON_VERSION}" \
    "python${PYTHON_VERSION}-dev" \
    "python${PYTHON_VERSION}-distutils" \
    swig \
    tcl \
    tk \
    udev \
    vim \
    wget
  # Make the selected interpreter the default "python".
  ln --symbolic --force "/usr/bin/python${PYTHON_VERSION}" /usr/bin/python
  # Bootstrap pip for that interpreter, then discard the installer script.
  curl --remote-name https://bootstrap.pypa.io/get-pip.py \
    && python get-pip.py \
    && rm get-pip.py
  # Download the Mellanox OFED bundle and run its installer with the
  # user-space-only option, then remove both the unpacked tree and the tarball.
  # MLNX_VERSION and MLNX_ARCH are set at the top of %post.
  mlnx_ofed_dir="MLNX_OFED_LINUX-${MLNX_VERSION}-${MLNX_ARCH}"
  wget "http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_VERSION}/${mlnx_ofed_dir}.tgz" && \
  tar -xzvf "${mlnx_ofed_dir}.tgz" --no-same-owner && \
  cd "${mlnx_ofed_dir}" && \
  ./mlnxofedinstall --user-space-only --without-fw-update --skip-repo --force && \
  cd .. && \
  # The previous cleanup removed "MLNX_OFED-LINUX-..." (hyphen), a directory
  # that never exists; the archive unpacks to "MLNX_OFED_LINUX-...". Only the
  # tarball downloaded above is deleted, rather than every *.tgz in the
  # working directory.
  rm -rf "${mlnx_ofed_dir}" "${mlnx_ofed_dir}.tgz"
  # Install Open MPI version 4.1
  # Built from source in the current directory with the options below,
  # installed with "make install", then the loader cache is refreshed.
  wget https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-4.1.0.tar.gz && \
  tar zxf openmpi-4.1.0.tar.gz && \
  cd openmpi-4.1.0 && \
  ./configure --enable-mpirun-prefix-by-default \
    --enable-shared \
    --without-verbs && \
  make -j "$(nproc)" all && \
  make install && \
  ldconfig && \
  cd .. && \
  # Remove what was actually downloaded and unpacked. The previous cleanup
  # targeted /tmp/openmpi, which this build never creates, so the source tree
  # and tarball stayed in the image.
  rm -rf openmpi-4.1.0 openmpi-4.1.0.tar.gz
  cd /
  pip --no-cache-dir --disable-pip-version-check install mpi4py
  # Export so later build steps inherit the path, and skip the ':' separator
  # when the inherited value is empty (an empty entry means the current
  # directory).
  export LD_LIBRARY_PATH="/usr/local/lib/:/usr/local/lib/openmpi${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
  # --------------------------------------------------------------------------
  # CUDA, cuDNN and NCCL from NVIDIA's apt repository for Ubuntu 18.04.
  # TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
  # From here on the frameworks are installed against Python 3.8; this
  # overrides the 3.7 selected at the top of %post.
  export PYTHON_VERSION=3.8
  export TENSORFLOW_VERSION=2.3.0
  export PYTORCH_VERSION=1.10.0+cu113
  # Exported before the first install so no package in this section can stop
  # the build with an interactive prompt.
  export DEBIAN_FRONTEND=noninteractive
  # Pin file that gives NVIDIA's repository priority over the distribution's.
  wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
  mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
  apt-get update && apt-get install -y gnupg
  # NVIDIA rotated the repository signing key in April 2022: 3bf863cc replaces
  # 7fa2af80, which no longer verifies the repository metadata.
  apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
  apt-get install -y software-properties-common
  apt-get update
  add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
  apt-get update
  apt-get -y install cuda
  apt-get update && apt-get install -y libcudnn8 libcudnn8-dev
  # "-y" is required: without it apt waits for confirmation and aborts because
  # the build has no interactive terminal. apt-get is used for consistency
  # with the other install commands in this script.
  apt-get install -y libnccl2 libnccl-dev
  # Disabled: separate NVIDIA machine-learning repository.
  # echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
  # Second package pass for the PYTHON_VERSION exported above: build tools,
  # image libraries and the interpreter with its headers (alphabetical order).
  apt-get -y update && apt-get install \
    --yes --allow-downgrades --allow-change-held-packages --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    git \
    libjpeg-dev \
    libpng-dev \
    "python${PYTHON_VERSION}" \
    "python${PYTHON_VERSION}-dev" \
    vim \
    wget
  # Re-point "python" at the newly selected interpreter.
  ln --symbolic --force "/usr/bin/python${PYTHON_VERSION}" /usr/bin/python
  # Bootstrap pip for it, then discard the installer script.
  curl --remote-name https://bootstrap.pypa.io/get-pip.py \
    && python get-pip.py \
    && rm get-pip.py
  # Install TensorFlow, Keras and PyTorch
  pip install "torch==${PYTORCH_VERSION}" torchvision==0.11.1+cu113 torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
  pip install mxnet-cu112 "tensorflow-gpu==${TENSORFLOW_VERSION}" keras h5py filelock matplotlib scikit-learn
  # Use the unversioned CUDA path, as %environment already does, instead of a
  # hard-coded /usr/local/cuda-11.5: the "cuda" meta-package installed earlier
  # is not pinned to a release.
  # NOTE(review): assumes the CUDA packages provide the /usr/local/cuda
  # symlink -- confirm.
  cuda_root=/usr/local/cuda
  export PATH="${cuda_root}/bin:$PATH"
  # Install Horovod, temporarily using CUDA stubs
  # The stubs directory is added to the loader cache for the pip build and the
  # default cache is restored by the final ldconfig.
  ldconfig "${cuda_root}/targets/x86_64-linux/lib/stubs" && \
  HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod && \
  ldconfig
  # Set default NCCL parameters
  # Quoted so the '^' is always written literally.
  echo 'NCCL_DEBUG=INFO' >> /etc/nccl.conf && \
  echo 'NCCL_SOCKET_IFNAME=^docker0' >> /etc/nccl.conf
  # Download examples
  # GitHub no longer serves repositories over Subversion, so the former
  # "svn checkout .../trunk/examples" is replaced by a shallow git clone from
  # which only the examples directory is kept, as /examples. git is installed
  # by the earlier package steps, so subversion is no longer needed.
  cd / && \
  horovod_src="$(mktemp -d)" && \
  git clone --depth 1 https://github.com/uber/horovod.git "${horovod_src}" && \
  cp -r "${horovod_src}/examples" /examples && \
  rm -rf "${horovod_src}"
  # Install sciml-bench
  # Sources were copied to /sciml-benchmarks by the %files section.
  cd /sciml-benchmarks && pip install .
  %environment
  # (intentionally empty; the runtime variables are defined in the first
  # %environment section of this file)
  %runscript
  # Entry point for running the container: forward all arguments to
  # sciml-bench. "$@" is quoted so arguments containing spaces or glob
  # characters reach the program unchanged.
  sciml-bench "$@"