ソースを参照

refactor playbooks

Signed-off-by: John Lockman <jlockman3@gmail.com>
John Lockman 5 年 前
コミット
f320faeeaf

BIN
slurm/roles/slurm-common/files/munge.key


+ 97 - 0
slurm/roles/slurm-common/files/slurm.conf

@@ -0,0 +1,97 @@
+#
+# Example slurm.conf file. Please run configurator.html
+# (in doc/html) to build a configuration file customized
+# for your environment.
+#
+#
+# slurm.conf file generated by configurator.html.
+#
+# See the slurm.conf man page for more information.
+#
+ClusterName=friday
+ControlMachine=friday
+ControlAddr=10.0.0.1
+#BackupController=
+#BackupAddr=
+#
+SlurmUser=slurm
+#SlurmdUser=root
+SlurmctldPort=6817
+SlurmdPort=6818
+AuthType=auth/munge
+#JobCredentialPrivateKey=
+#JobCredentialPublicCertificate=
+StateSaveLocation=/var/spool/slurm/ctld
+SlurmdSpoolDir=/var/spool/slurm/
+SwitchType=switch/none
+MpiDefault=none
+SlurmctldPidFile=/var/run/slurmctld.pid
+SlurmdPidFile=/var/run/slurmd.pid
+ProctrackType=proctrack/pgid
+#PluginDir=
+#FirstJobId=
+ReturnToService=2
+#MaxJobCount=
+#PlugStackConfig=
+#PropagatePrioProcess=
+#PropagateResourceLimits=
+#PropagateResourceLimitsExcept=
+#Prolog=
+#Epilog=
+#SrunProlog=
+#SrunEpilog=
+#TaskProlog=
+#TaskEpilog=
+#TaskPlugin=
+#TrackWCKey=no
+#TreeWidth=50
+#TmpFS=
+#UsePAM=
+#
+# TIMERS
+SlurmctldTimeout=300
+SlurmdTimeout=300
+InactiveLimit=0
+MinJobAge=300
+KillWait=30
+Waittime=0
+#
+# SCHEDULING
+SchedulerType=sched/backfill
+#SchedulerAuth=
+SelectType=select/linear
+#FastSchedule=1
+# Multifactor priority: fairshare carries the dominant weight (100000),
+# with usage decaying over a 14-day half-life.
+PriorityType=priority/multifactor
+PriorityDecayHalfLife=14-0
+#PriorityUsageResetPeriod=14-0
+PriorityWeightFairshare=100000
+PriorityWeightAge=1000
+PriorityWeightPartition=10000
+PriorityWeightJobSize=1000
+PriorityMaxAge=14-0
+#
+# LOGGING
+SlurmctldDebug=3
+SlurmctldLogFile=/var/log/slurm/slurmctld.log
+SlurmdDebug=1
+SlurmdLogFile=/var/log/slurm/slurmd.log
+JobCompType=jobcomp/none
+#JobCompLoc=
+#
+# ACCOUNTING
+JobAcctGatherType=jobacct_gather/linux
+JobAcctGatherFrequency=30
+#
+AccountingStorageType=accounting_storage/slurmdbd
+#AccountingStorageHost=
+#AccountingStorageLoc=
+#AccountingStoragePass=
+#AccountingStorageUser=
+#
+# COMPUTE NODES
+#NodeName=linux[1-32] Procs=1 State=UNKNOWN
+#NodeName=DEFAULT Sockets=2 CoresPerSocket=20 State=UNKNOWN
+# NOTE(review): compute001 is absent here and in the Ansible inventory —
+# confirm the gap is intentional.
+NodeName=compute000 Sockets=2 CoresPerSocket=8
+NodeName=compute[002-005] CoresPerSocket=20
+PartitionName=normal Nodes=ALL Default=YES MaxTime=INFINITE State=UP
+#PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP

+ 91 - 0
slurm/roles/slurm-common/tasks/main.yaml

@@ -0,0 +1,91 @@
+---
+# Common setup for every Slurm node: packages, munge auth, slurm user/dirs.
+
+- name: install packages for slurm
+  yum:
+    name:
+      - munge
+      - mariadb
+      - mariadb-devel
+      - python3
+    state: present
+  tags: install
+
+- name: create munge key
+  # NOTE(review): the generated key is immediately overwritten by the
+  # checked-in munge.key below; 'creates' makes this idempotent instead of
+  # forcing (-f) a regeneration on every run.
+  command: /usr/sbin/create-munge-key -f
+  args:
+    creates: /etc/munge/munge.key
+  tags: install
+
+- name: Copy munge key
+  # The shared key must be byte-identical on all nodes for munge auth to work.
+  copy:
+    src: munge.key
+    dest: /etc/munge
+    owner: munge
+    group: munge
+    mode: "0400"  # quoted so the octal literal is not parsed as integer 256
+  tags: install
+
+- name: Copy example Slurm Configuration - slurm.conf
+  copy:
+    src: slurm.conf
+    dest: /etc/slurm/
+    mode: "0644"
+  tags: install
+
+- name: create SLURM Group
+  group:
+    name: slurm
+    state: present
+  tags: install
+
+- name: Add the user 'slurm' with uid 6001 and a primary group of 'slurm'
+  user:
+    name: slurm
+    comment: Slurm User Account
+    uid: 6001
+    group: slurm
+  tags: install
+
+- name: create SLURM log directory
+  file:
+    path: /var/log/slurm
+    state: directory
+    owner: slurm
+    group: slurm
+    mode: "0755"
+    recurse: true
+  tags: install
+
+- name: give slurm user permission to spool
+  file:
+    path: /var/spool/slurm
+    owner: slurm
+    group: slurm
+    state: directory
+    mode: "0755"
+    recurse: true
+  tags: install  # tag added for consistency with the other tasks in this role
+
+- name: give slurm user permission to slurmctld
+  # NOTE(review): state=touch reports "changed" on every run, and 0755 is
+  # unusually permissive for a pid file — confirm both are intended.
+  file:
+    path: /var/run/slurmctld.pid
+    owner: slurm
+    group: slurm
+    mode: "0755"
+    state: touch
+  tags: install
+
+- name: give slurm user permission to slurmd
+  file:
+    path: /var/run/slurmd.pid
+    owner: slurm
+    group: slurm
+    mode: "0755"
+    state: touch
+  tags: install
+
+- name: start munge service
+  service:
+    name: munge
+    state: restarted
+    enabled: true
+  tags: install

+ 98 - 0
slurm/roles/slurm-master/tasks/main.yaml

@@ -0,0 +1,98 @@
+---
+# Master-only setup: build/install Slurm RPMs, open firewall, configure
+# accounting (MariaDB + slurmdbd), then start the controller daemons.
+
+- name: Download Slurm source
+  get_url:
+    url: "{{ slurm_url }}"
+    dest: /root/Downloads/
+    checksum: "{{ slurm_md5 }}"
+  tags: install
+
+- name: Build SLURM RPMs
+  # NOTE(review): tarball filename is hard-coded — keep in sync with slurm_url
+  # (defined in the inventory) or derive it from that variable.
+  command: rpmbuild -ta /root/Downloads/slurm-20.02.0.tar.bz2
+  tags: install
+
+- name: Copy RPMs to NFS share
+  copy:
+    src: "{{ item }}"
+    dest: /home/rpms/
+  with_fileglob:
+    - /root/rpmbuild/RPMS/x86_64/slurm*20*.rpm
+  tags: install
+
+- name: Install SLURM RPMs on Master
+  yum:
+    name: "{{ item }}"
+    state: present
+    #name: "{{ query('fileglob', ['/home/rpms/slurm*20*.rpm']) }}" <-- how it should work to avoid loop
+  with_fileglob:
+    - /home/rpms/slurm*20*.rpm
+  tags: install
+
+- name: Firewall Rule slurm allow slurm daemon ports
+  # 6817 = slurmctld, 6818 = slurmd, 6819 = slurmdbd (matches slurm.conf).
+  command: "firewall-cmd --zone=internal --add-port={{ item }}/tcp --permanent"
+  loop:
+    - 6817
+    - 6818
+    - 6819
+  tags: install
+
+- name: Firewall Rule slurm allow all incoming traffic on internal network
+  command: firewall-cmd --permanent --zone=internal --add-rich-rule='rule family="ipv4" source address="192.168.1.0/24" accept'
+  tags: install
+
+- name: Firewall Reload
+  command: firewall-cmd --reload
+  tags: install
+
+- name: Start MariaDB
+  service:
+    name: mariadb
+    state: restarted
+    enabled: true
+  tags: install
+
+- name: Grant Permissions for SLURM DB
+  # FIXME(security): database password is hard-coded and committed to VCS —
+  # move it into an Ansible Vault variable.
+  command: mysql -u root -e "GRANT ALL ON slurm_acct_db.* TO 'slurm'@'localhost' identified by 'password' with grant option;"
+  tags: install
+
+- name: Create slurmdbd.conf file
+  # The example file is installed by the Slurm RPMs on the target host, so it
+  # must be copied remote-to-remote; without remote_src the copy module would
+  # look for it on the Ansible control node.
+  copy:
+    src: /etc/slurm/slurmdbd.conf.example
+    dest: /etc/slurm/slurmdbd.conf
+    mode: "0600"
+    remote_src: true
+  tags: install
+
+- name: Populate Accounting Database
+  # NOTE(review): launching slurmdbd once creates the accounting schema;
+  # consider the service module instead of a bare command.
+  command: slurmdbd
+  tags: install
+
+- name: Create Slurm Cluster
+  command: sacctmgr -i add cluster {{ inventory_hostname }}
+  tags: install
+
+- name: Create Default Slurm Group
+  command: sacctmgr -i add account defaultgroup Cluster={{ inventory_hostname }} Description="Default Account" Organization="Default Org"
+  tags: install
+
+- name: Add root to the Default Account
+  command: sacctmgr -i add user root DefaultAccount=defaultgroup
+  tags: install
+
+- name: Start slurmctld on Master
+  service:
+    name: slurmctld
+    state: restarted
+    enabled: true
+  tags: install
+
+- name: Enable Slurmdbd on Master
+  service:
+    name: slurmdbd
+    state: restarted
+    enabled: true
+  tags: install

+ 9 - 0
slurm/roles/start-slurm-workers/tasks/main.yml

@@ -0,0 +1,9 @@
+---
+---
+- name: Install SLURM RPMs on compute
+  # Install the RPMs previously built on the master and shared via /home/rpms.
+  # NOTE(review): with_fileglob globs on the CONTROL node — this only works if
+  # /home/rpms is also visible there (e.g. NFS-mounted); confirm.
+  yum:
+    name: "{{ item }}"
+    state: present
+    #name: "{{ query('fileglob', ['/home/rpms/slurm*20*.rpm']) }}" <-- how it should work to avoid loop
+  with_fileglob:
+    - /home/rpms/slurm*20*.rpm
+  tags: install
+

+ 23 - 0
slurm/slurm-cluster.yaml

@@ -0,0 +1,23 @@
+---
+# Playbook for installing Slurm on a cluster.
+
+# Collect facts from every host. This play has no tasks; fact gathering is
+# its only effect, which lets the later plays run with gather_facts: false.
+- hosts: all
+
+# Apply common installation and config (munge, slurm user, slurm.conf).
+- hosts: cluster
+  gather_facts: false
+  roles:
+    - slurm-common
+
+# Apply master config and start controller services.
+- hosts: master
+  gather_facts: false
+  roles:
+    - slurm-master
+
+# Install Slurm on the compute (worker) nodes.
+- hosts: compute
+  gather_facts: false
+  roles:
+    - start-slurm-workers

+ 18 - 0
slurm/slurm_inventory_file

@@ -0,0 +1,18 @@
+# Slurm head node (runs slurmctld/slurmdbd and builds the RPMs).
+[master]
+friday
+
+[master:vars]
+# Source tarball and checksum consumed by the slurm-master role.
+slurm_url=https://download.schedmd.com/slurm/slurm-20.02.0.tar.bz2
+slurm_md5=md5:8ed2257471ff24ca213b510a4c1c3563
+
+# Compute nodes. NOTE(review): compute001 is skipped here and in slurm.conf —
+# confirm intentional.
+[compute]
+compute000
+compute[002:005]
+
+
+[workers:children]
+compute
+
+[cluster:children]
+master
+workers