소스 검색

Merge pull request #268 from blesson-james/devel

Issue #261: Fix for slurm jobs not getting submitted
Lucas A. Wilson 4 년 전
부모
커밋
e626cdc524

+ 40 - 0
roles/common/tasks/main.yml

@@ -84,3 +84,43 @@
   include_tasks: amd.yml
   when: ansible_local.inventory.amd_gpu > 0
   tags: install
+
+- name: Get the hostname
+  command: hostname  # read-only query of the node's current hostname
+  register: machine_hostname  # .stdout is consumed by the set_fact and hosts-file tasks below
+  changed_when: false  # querying the hostname never modifies the node, so never report "changed"
+
+- name: Set facts for node hostname and ip
+  set_fact:  # stored per host so other hosts can read them through hostvars[...]
+    node_ip: "{{ inventory_hostname }}"  # assumes inventory entries are IP addresses - TODO confirm
+    node_hostname: "{{ machine_hostname.stdout }}"  # output of the `hostname` command registered above
+
+- name: Add host name in hosts file
+  lineinfile:  # ensures this node's own entry is present in the hosts file
+    dest: "{{ hosts_file_dest }}"
+    line: "{{ inventory_hostname }} {{ machine_hostname.stdout }}"  # "<inventory name> <hostname>"; inventory name presumably an IP - verify
+    state: present
+    create: yes  # create the file if it does not exist yet
+    mode: "{{ hosts_file_mode }}"
+
+- name: Add compute hosts info in manager node hosts file
+  lineinfile:  # one hosts-file entry per compute node
+    dest: "{{ hosts_file_dest }}"
+    line: "{{ hostvars[item].node_ip }} {{ hostvars[item].node_hostname }}"  # facts set on each host by the set_fact task above
+    state: present
+    create: yes
+    mode: "{{ hosts_file_mode }}"
+  with_items:
+    - "{{ groups['compute'] }}"  # with_items flattens this, giving one iteration per compute host
+  when: "'manager' in group_names"  # run only on hosts that belong to the manager group
+
+- name: Add manager hosts info in compute node hosts file
+  lineinfile:  # one hosts-file entry per manager node
+    dest: "{{ hosts_file_dest }}"
+    line: "{{ hostvars[item].node_ip }} {{ hostvars[item].node_hostname }}"  # facts set on each host by the set_fact task above
+    state: present
+    create: yes
+    mode: "{{ hosts_file_mode }}"
+  with_items:
+    - "{{ groups['manager'] }}"  # with_items flattens this, giving one iteration per manager host
+  when: "'compute' in group_names"  # run only on hosts that belong to the compute group

+ 3 - 0
roles/common/vars/main.yml

@@ -70,3 +70,6 @@ nvidia_packages:
 
 daemon_file_dest: /etc/docker/
 daemon_file_mode: 0644
+
+hosts_file_dest: "/etc/hosts"  # file edited by the hosts-file lineinfile tasks in roles/common/tasks/main.yml
+hosts_file_mode: "0644"  # quoted so the octal mode reaches the module as a string

+ 2 - 0
roles/slurm_common/files/slurm.conf

@@ -18,6 +18,7 @@ SlurmUser=
 #SlurmdUser=root
 SlurmctldPort=
 SlurmdPort=
+SrunPortRange=
 AuthType=auth/munge
 #JobCredentialPrivateKey=
 #JobCredentialPublicCertificate=
@@ -56,6 +57,7 @@ InactiveLimit=0
 MinJobAge=300
 KillWait=30
 Waittime=0
+MessageTimeout=60
 #
 # SCHEDULING
 SchedulerType=sched/backfill

+ 6 - 8
roles/slurm_common/tasks/main.yml

@@ -26,14 +26,6 @@
     backup: yes
     mode: "{{ common_mode }}"
 
-- name: Add host name in hosts file
-  lineinfile:
-    dest: "{{ hosts_dest }}"
-    line: "{{ inventory_hostname }} {{ host_name.stdout }}"
-    state: present
-    create: yes
-    mode: "{{ common_mode }}"
-
 - name: Install packages for slurm
   package:
     name: "{{ item }}"
@@ -86,6 +78,12 @@
     regexp: "SlurmdPort="
     line: "SlurmdPort={{ slurmd_port }}"
 
+- name: Add srun port range
+  lineinfile:  # fills in the SrunPortRange= placeholder in the slurm configuration file
+    path: "{{ slurm_confpth }}"
+    regexp: "SrunPortRange="  # unanchored pattern: the last line containing this text is replaced
+    line: "SrunPortRange={{ srun_port_range }}"
+
 - name: Add spool path
   lineinfile:
     path: "{{ slurm_confpth }}"

+ 1 - 0
roles/slurm_common/vars/main.yml

@@ -35,6 +35,7 @@ slurm_confpth: "/etc/slurm/slurm.conf"
 slurm_user: "slurm"
 slurmctld_port: "6817"
 slurmd_port: "6818"
+srun_port_range: "60001-63000"
 acct_port: "6819"
 slurm_uid: "6001"
 slurm_logpth: "/var/log/slurm/"

+ 1 - 0
roles/slurm_manager/tasks/main.yml

@@ -145,6 +145,7 @@
     - "{{ tcp_port1 }}"
     - "{{ tcp_port2 }}"
     - "{{ tcp_port3 }}"
+    - "{{ tcp_port4 }}"
     - "{{ udp_port3 }}"
     - "{{ udp_port1 }}"
     - "{{ udp_port2 }}"

+ 1 - 0
roles/slurm_manager/vars/main.yml

@@ -56,6 +56,7 @@ rpm_loop: "/rpmbuild/RPMS/x86_64/*.rpm"
 tcp_port1: "6817/tcp"
 tcp_port2: "6818/tcp"
 tcp_port3: "6819/tcp"
+tcp_port4: "60001-63000/tcp"
 udp_port1: "6817/udp"
 udp_port2: "6818/udp"
 udp_port3: "6819/udp"

+ 0 - 10
roles/slurm_start_services/tasks/main.yml

@@ -25,16 +25,6 @@
     dest: "{{ slurm_confpth }}"
     mode: "{{ slurm_mode }}"
 
-- name: Add compute hosts info in hosts file
-  lineinfile:
-    dest: "{{ hosts_dest }}"
-    line: "{{ hostvars[item].compute_host }} {{ hostvars[item].compute_ip }}"
-    state: present
-    create: yes
-    mode: "{{ common_mode }}"
-  with_items:
-    - "{{ groups['compute'] }}"
-
 - name: Enable slurmdbd on manager
   systemd:
     name: slurmdbd