Browse Source

Merge pull request #5 from alan-turing-institute/pytorch_base

Add base container for pytorch
Jim Madge 3 years ago
parent
commit
5782bb6e04

+ 22 - 0
.github/workflows/lint.yaml

@@ -44,3 +44,25 @@ jobs:
 
       - name: ShellCheck
         uses: ludeeus/action-shellcheck@1.1.0
+
+  python:
+    name: "Lint Python"
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        run: pip install flake8 mypy
+
+      - name: Flake8
+        run: flake8 $(find . -name '*.py')
+
+      - name: Mypy
+        run: mypy --strict $(find . -name '*.py')

+ 17 - 0
base_containers/pytorch/pytorch_cu.def.template

@@ -0,0 +1,17 @@
+BootStrap: library
+From: ubuntu:20.04
+
+%post
+    apt-get -y update
+
+    # Add universe repository (necessary for python3-pip)
+    apt-get -y install software-properties-common
+    add-apt-repository -y -u universe
+
+    # Install python packages
+    apt-get -y install python3 python3-pip
+
+    apt-get clean
+
+    # Install python dependencies
+    pip3 install --no-cache-dir torch==$torch_version torchvision==$torchvision_version torchaudio==$torchaudio_version -f $find_links

+ 64 - 0
base_containers/pytorch/template.py

@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+import argparse
+from string import Template
+
+# Substitution values for pytorch_cu.def.template, keyed by CUDA version.
+# Each entry supplies the $torch_version, $torchvision_version,
+# $torchaudio_version and $find_links placeholders used by the template's
+# pip3 install line. The keys are also offered as command line choices.
+CUDA = {
+    '10.2': {
+        'torch_version': '1.9.0+cu102',
+        'torchvision_version': '0.10.0+cu102',
+        'torchaudio_version': '0.9.0',
+        'find_links': 'https://download.pytorch.org/whl/torch_stable.html'
+    },
+    '11.1': {
+        'torch_version': '1.9.0+cu111',
+        'torchvision_version': '0.10.0+cu111',
+        'torchaudio_version': '0.9.0',
+        'find_links': 'https://download.pytorch.org/whl/torch_stable.html'
+    },
+    '11.3': {
+        'torch_version': '1.10.2+cu113',
+        'torchvision_version': '0.11.3+cu113',
+        'torchaudio_version': '0.10.2+cu113',
+        'find_links': (
+            'https://download.pytorch.org/whl/cu113/torch_stable.html'
+        )
+    }
+}
+
+
+def render(cuda_version: str) -> str:
+    with open('pytorch_cu.def.template', 'r', encoding='utf8') as f:
+        template = Template(f.read())
+
+    return template.substitute(**CUDA[cuda_version])
+
+
+def write_def(cuda_version: str, text: str) -> None:
+    file_name = f'pytorch_cu_{cuda_version}.def'
+    with open(file_name, 'w', encoding='utf8') as f:
+        f.write(text)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description='Template Pytorch definition files'
+    )
+    parser.add_argument(
+        'cuda',
+        help='CUDA version',
+        type=str,
+        choices=['all'] + list(CUDA.keys())
+    )
+
+    args = parser.parse_args()
+
+    if args.cuda == 'all':
+        for cuda_version in CUDA.keys():
+            write_def(cuda_version, render(cuda_version))
+    else:
+        write_def(args.cuda, render(args.cuda))
+
+
+if __name__ == '__main__':
+    main()

+ 20 - 15
workflows/pytorch_GAN_zoo/README.md

@@ -4,8 +4,8 @@ This example builds a singularity container for [Facebook Research's PyTorch GAN
 Zoo](https://github.com/facebookresearch/pytorch_GAN_zoo).
 
 The singularity container will allow you to call all the scripts from the
-project and includes are requirements. The container supports CUDA versions
-10.1, 10.2 and 11.1 on the host.
+project and includes all requirements. The container supports CUDA version 11.1
+on the host.
 
 ## Building
 
@@ -36,16 +36,23 @@ singularity exec pytorch_GAN_zoo.sif eval.py
 Any flags or command line arguments can be declared after the script name.
 
 When training, you will need to supply the `--nv` flag to singularity so that
-the host GPU may be used. You will also need to select a singularity app, using
-the `--app` flag to select the appropriate CUDA version. The available apps are
-`cu101`, `cu102`, and `cu111` for CUDA 10.1, 10.2 and 11.1 respectively.
+the host GPU may be used.
 
-For example, to pre-process the dtd dataset and train a PGAN model on a host
-with CUDA 10.2 you could run the following commands.
+### Multiple GPUs
+
+PyTorch GAN zoo natively supports [parallelisation across multiple
+GPUs](https://github.com/facebookresearch/pytorch_GAN_zoo/issues/57). The
+devices to use can be selected using the `CUDA_VISIBLE_DEVICES` environment
+variable. CUDA compatible GPUs are numbered from zero. For example, to use the
+first and third CUDA accelerators you would set `CUDA_VISIBLE_DEVICES=0,2`.
+
+To pass this environment variable to singularity the `--env-file` flag must be
+used as [passing environment variables with commas is not supported by the
+`--env` flag](https://github.com/apptainer/singularity/issues/6088).
 
 ```bash
-singularity exec --app cu102 pytorch_GAN_zoo.sif datasets.py dtd <path to dtd dataset>/images/
-singularity exec --nv --app cu102 pytorch_GAN_zoo.sif train.py PGAN -c config_dtd.json --restart --no_vis -n dtd
+echo 'CUDA_VISIBLE_DEVICES=0,1' > env.txt
+singularity exec --env-file env.txt pytorch_GAN_zoo.sif ...
 ```
 
 ### Models
@@ -60,16 +67,14 @@ In each example the `--restart` flag is used so that checkpoints are
 periodically written during the training. The `--no_vis` flag is used to disable
 visdom visualisations.
 
-As above, these examples assume the host has CUDA 10.2 installed.
-
 #### DTD
 
 The DTD dataset requires no preprocessing, so the datasets script simply creates
 a configuration file.
 
 ```bash
-singularity exec --app cu102 pytorch_GAN_zoo.sif datasets.py dtd <path to dtd>/images
-singularity exec --nv --app cu102 pytorch_GAN_zoo.sif train.py PGAN -c config_dtd.json --restart --no_vis -n dtd
+singularity exec pytorch_GAN_zoo.sif datasets.py dtd <path to dtd>/images
+singularity exec pytorch_GAN_zoo.sif train.py PGAN -c config_dtd.json --restart --no_vis -n dtd
 ```
 
 Where `<path to dtd>` is the path of the directory extracted from the dtd
@@ -82,8 +87,8 @@ A processed dataset will be written to a directory declared using the `-o` flag,
 `cifar-10` in this example.
 
 ```bash
-singularity exec --app cu102 pytorch_GAN_zoo.sif datasets.py cifar10 <path to cifar-10> -o cifar10
-singularity exec --nv --app cu102 pytorch_GAN_zoo.sif train.py -c config_cifar10.json --restart --no_vis -n cifar10
+singularity exec pytorch_GAN_zoo.sif datasets.py cifar10 <path to cifar-10> -o cifar10
+singularity exec pytorch_GAN_zoo.sif train.py -c config_cifar10.json --restart --no_vis -n cifar10
 ```
 
 Where `<path to cifar-10>` is the path of the directory containing the pickle

+ 18 - 1
workflows/pytorch_GAN_zoo/build.sh

@@ -1,9 +1,26 @@
-#!/bin/sh
+#!/bin/bash
 
 _UID=$(id -u)
 DEF_FILE="pytorch_GAN_zoo.def"
 SIF_FILE="pytorch_GAN_zoo.sif"
 
+TORCH_DEF_FILE="pytorch_cu_11.1.def"
+TORCH_SIF_FILE="pytorch_cu_11.1.sif"
+
+pushd ../../base_containers/pytorch/ || exit
+if ! [ -f $TORCH_SIF_FILE ]; then
+    if ! [ -f $TORCH_DEF_FILE ]; then
+        ./template.py 11.1
+    fi
+
+    if [ "$_UID" = 0 ]; then
+        singularity build $TORCH_SIF_FILE $TORCH_DEF_FILE
+    else
+        singularity build --fakeroot $TORCH_SIF_FILE $TORCH_DEF_FILE
+    fi
+fi
+
+popd || exit
 if [ "$_UID" = 0 ]; then
     singularity build $SIF_FILE $DEF_FILE
 else

+ 2 - 46
workflows/pytorch_GAN_zoo/pytorch_GAN_zoo.def

@@ -1,5 +1,5 @@
-BootStrap: library
-From: ubuntu:20.04
+BootStrap: localimage
+From: ../../base_containers/pytorch/pytorch_cu_11.1.sif
 
 # Global settings
 %files
@@ -29,47 +29,3 @@ From: ubuntu:20.04
     # Install python dependencies
     pip3 install --no-cache-dir -r requirements.txt
     pip3 install --no-cache-dir imageio  # For image processing
-
-
-# CUDA 11.1 app
-%apphelp cu111
-    Cuda 11.1 support, torch 1.9.0, torchvision 0.10.0, torchaudio 0.9.0
-
-%appinstall cu111
-    python3 -m venv --system-site-packages ./venv_cu111
-    . ./venv_cu111/bin/activate
-    pip3 install --no-cache-dir torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
-    deactivate
-
-%appenv cu111
-    PATH="/scif/apps/cu111/venv_cu111/bin:$PATH"
-    export PATH
-
-# CUDA 10.2 app
-%apphelp cu102
-    Cuda 10.2 support, torch 1.9.0, torchvision 0.10.0, torchaudio 0.9.0
-
-%appinstall cu102
-    python3 -m venv --system-site-packages ./venv_cu102
-    . ./venv_cu102/bin/activate
-    pip3 install --no-cache-dir torch torchvision torchaudio
-    deactivate
-
-%appenv cu102
-    PATH="/scif/apps/cu102/venv_cu102/bin:$PATH"
-    export PATH
-
-
-# CUDA 10.1 app
-%apphelp cu101
-    Cuda 10.1 support, torch 1.7.1, torchvision 0.8.2, torchaudio 0.7.2
-
-%appinstall cu101
-    python3 -m venv --system-site-packages ./venv_cu101
-    . ./venv_cu101/bin/activate
-    pip3 install --no-cache-dir torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
-    deactivate
-
-%appenv cu101
-    PATH="/scif/apps/cu101/venv_cu101/bin:$PATH"
-    export PATH