
Issue #270: Fix for k8s pods DNS issue

Signed-off-by: blesson-james <blesson_james@Dellteam>
Lucas A. Wilson committed 4 years ago
parent commit f2d7a42bbb

+ 6 - 1
omnia_config.yml

@@ -21,4 +21,9 @@ mariadb_password: "password"
 # Kubernetes SDN network.
 # It can either be "calico" or "flannel".
 # Default value assigned is "calico".
-k8s_cni: "calico"
+k8s_cni: "calico"
+
+# Kubernetes pod network CIDR.
+# Default value is "10.244.0.0/16"
+# Make sure this value does not overlap with any of the host networks.
+k8s_pod_network_cidr: "10.244.0.0/16"
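
If the cluster hosts already sit inside 10.244.0.0/16, any non-overlapping private range works; for example (a hypothetical value, not part of this commit):

    k8s_pod_network_cidr: "192.168.0.0/16"

Overlap between the pod CIDR and a host network is a common cause of the in-cluster DNS failures this commit targets, since pod traffic can get routed out the wrong interface.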

+ 1 - 1
platforms/roles/jupyterhub/vars/main.yml

@@ -23,4 +23,4 @@ helm_chart_version: 0.9.0
 
 timeout_min_sec: 60m
 
-jupyterhub_namespace: default
+jupyterhub_namespace: jupyterhub
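
Moving JupyterHub out of the default namespace gives its services unambiguous in-cluster DNS names. Assuming the stock zero-to-jupyterhub chart service names (an assumption, not shown in this diff), the hub resolves at:

    hub.jupyterhub.svc.cluster.local

rather than sharing the crowded default namespace.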

+ 11 - 1
roles/cluster_validation/tasks/fetch_password.yml

@@ -54,7 +54,8 @@
   register: input_config_check
   when:
     - mariadb_password | length < 1 or
-      k8s_cni | length < 1
+      k8s_cni | length < 1 or
+      k8s_pod_network_cidr | length < 1
 
 - name: Assert mariadb_password
   assert:
@@ -74,10 +75,19 @@
     success_msg: "{{ success_msg_k8s_cni }}"
     fail_msg: "{{ fail_msg_k8s_cni }}"
 
+- name: Assert kubernetes pod network CIDR
+  assert:
+    that:
+      - k8s_pod_network_cidr | length > 9
+      - '"/" in k8s_pod_network_cidr '
+    success_msg: "{{ success_msg_k8s_pod_network_cidr }}"
+    fail_msg: "{{ fail_msg_k8s_pod_network_cidr }}"
+
 - name: Save input variables from file
   set_fact:
     db_password: "{{ mariadb_password }}"
     k8s_cni: "{{ k8s_cni }}"
+    k8s_pod_network_cidr: "{{ k8s_pod_network_cidr }}"
   no_log: True
 
 - name: Encrypt input config file
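
The new assert only checks that the value is longer than 9 characters and contains a "/", so a malformed string such as "not/a/cidr" would pass. A stricter check could lean on the ipaddr filter; a minimal sketch, assuming the ansible.utils collection (and its netaddr dependency) is installed, not what this commit ships:

    - name: Assert kubernetes pod network CIDR
      assert:
        that:
          - k8s_pod_network_cidr | ansible.utils.ipaddr('net')
        success_msg: "{{ success_msg_k8s_pod_network_cidr }}"
        fail_msg: "{{ fail_msg_k8s_pod_network_cidr }}"

ipaddr('net') returns the CIDR when it is a syntactically valid network and False otherwise, so the assert fails on anything kubeadm would reject.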

+ 2 - 0
roles/cluster_validation/vars/main.yml

@@ -21,6 +21,8 @@ fail_msg_mariadb_password: "maria_db password not given in correct format."
 success_msg_mariadb_password: "mariadb_password validated"
 success_msg_k8s_cni: "Kubernetes CNI Validated"
 fail_msg_k8s_cni: "Kubernetes CNI not correct."
+success_msg_k8s_pod_network_cidr: "Kubernetes pod network CIDR validated"
+fail_msg_k8s_pod_network_cidr: "Kubernetes pod network CIDR not given in the correct format."
 
 #Usage: validations.yml
 skip_tag_fail_msg: "Can't skip both slurm and kubernetes"

+ 40 - 0
roles/common/tasks/main.yml

@@ -84,3 +84,43 @@
   include_tasks: amd.yml
   when: ansible_local.inventory.amd_gpu > 0
   tags: install
+
+- name: Get the hostname
+  command: hostname
+  register: machine_hostname
+  changed_when: false
+
+- name: Set facts for node hostname and ip
+  set_fact:
+    node_ip: "{{ inventory_hostname }}"
+    node_hostname: "{{ machine_hostname.stdout }}"
+
+- name: Add host name in hosts file
+  lineinfile:
+    dest: "{{ hosts_file_dest }}"
+    line: "{{ inventory_hostname }} {{ machine_hostname.stdout }}"
+    state: present
+    create: yes
+    mode: "{{ hosts_file_mode }}"
+
+- name: Add compute hosts info in manager node hosts file
+  lineinfile:
+    dest: "{{ hosts_file_dest }}"
+    line: "{{ hostvars[item].node_ip }} {{ hostvars[item].node_hostname }}"
+    state: present
+    create: yes
+    mode: "{{ hosts_file_mode }}"
+  with_items:
+    - "{{ groups['compute'] }}"
+  when: "'manager' in group_names"
+
+- name: Add manager hosts info in compute node hosts file
+  lineinfile:
+    dest: "{{ hosts_file_dest }}"
+    line: "{{ hostvars[item].node_ip }} {{ hostvars[item].node_hostname }}"
+    state: present
+    create: yes
+    mode: "{{ hosts_file_mode }}"
+  with_items:
+    - "{{ groups['manager'] }}"
+  when: "'compute' in group_names"

+ 3 - 0
roles/common/vars/main.yml

@@ -70,3 +70,6 @@ nvidia_packages:
 
 daemon_file_dest: /etc/docker/
 daemon_file_mode: 0644
+
+hosts_file_dest: "/etc/hosts"
+hosts_file_mode: "0644"

+ 2 - 1
roles/k8s_start_manager/tasks/main.yml

@@ -44,7 +44,8 @@
   tags: init
 
 - name: Initialize kubeadm
-  command: "/bin/kubeadm init --pod-network-cidr='{{ pod_network_cidr_ip }}' --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
+  command: "/bin/kubeadm init --pod-network-cidr='{{ hostvars['127.0.0.1']['k8s_pod_network_cidr'] }}' \
+    --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
   changed_when: true
   when: "'master' not in k8s_nodes.stdout"
   register: init_output
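
With the default from omnia_config.yml, the task renders to roughly this invocation (illustrative; the advertise address is whatever ansible_default_ipv4 reports on the manager):

    /bin/kubeadm init --pod-network-cidr='10.244.0.0/16' --apiserver-advertise-address='10.0.0.10'

The hostvars['127.0.0.1'] lookup depends on cluster_validation having saved k8s_pod_network_cidr as a fact on localhost earlier in the run.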

+ 1 - 3
roles/k8s_start_manager/vars/main.yml

@@ -13,8 +13,6 @@
 #  limitations under the License.
 ---
 
-pod_network_cidr_ip: 10.244.0.0/16
-
 k8s_root_directory: /root/.kube
 
 k8s_root_directory_mode: 0755
@@ -43,4 +41,4 @@ k8s_clusterRoleBinding_file_mode: 0655
 
 calico_yml_url: https://docs.projectcalico.org/manifests/calico.yaml
 
-flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
+flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml

+ 2 - 0
roles/slurm_common/files/slurm.conf

@@ -18,6 +18,7 @@ SlurmUser=
 #SlurmdUser=root
 SlurmctldPort=
 SlurmdPort=
+SrunPortRange=
 AuthType=auth/munge
 #JobCredentialPrivateKey=
 #JobCredentialPublicCertificate=
@@ -56,6 +57,7 @@ InactiveLimit=0
 MinJobAge=300
 KillWait=30
 Waittime=0
+MessageTimeout=60
 #
 # SCHEDULING
 SchedulerType=sched/backfill
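
The empty SrunPortRange= placeholder is filled in by the new lineinfile task in roles/slurm_common/tasks/main.yml, so the deployed file reads:

    SrunPortRange=60001-63000
    MessageTimeout=60

Pinning srun to a fixed port range makes it possible to open exactly those ports in the firewall (tcp_port4 below), and raising MessageTimeout above its 10-second default gives slow nodes more time to answer slurmctld.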

+ 6 - 8
roles/slurm_common/tasks/main.yml

@@ -26,14 +26,6 @@
     backup: yes
     mode: "{{ common_mode }}"
 
-- name: Add host name in hosts file
-  lineinfile:
-    dest: "{{ hosts_dest }}"
-    line: "{{ inventory_hostname }} {{ host_name.stdout }}"
-    state: present
-    create: yes
-    mode: "{{ common_mode }}"
-
 - name: Install packages for slurm
   package:
     name: "{{ item }}"
@@ -86,6 +78,12 @@
     regexp: "SlurmdPort="
     line: "SlurmdPort={{ slurmd_port }}"
 
+- name: Add srun port range
+  lineinfile:
+    path: "{{ slurm_confpth }}"
+    regexp: "SrunPortRange="
+    line: "SrunPortRange={{ srun_port_range }}"
+
 - name: Add spool path
   lineinfile:
     path: "{{ slurm_confpth }}"

+ 1 - 0
roles/slurm_common/vars/main.yml

@@ -35,6 +35,7 @@ slurm_confpth: "/etc/slurm/slurm.conf"
 slurm_user: "slurm"
 slurmctld_port: "6817"
 slurmd_port: "6818"
+srun_port_range: "60001-63000"
 acct_port: "6819"
 slurm_uid: "6001"
 slurm_logpth: "/var/log/slurm/"

+ 1 - 0
roles/slurm_manager/tasks/main.yml

@@ -145,6 +145,7 @@
     - "{{ tcp_port1 }}"
     - "{{ tcp_port2 }}"
     - "{{ tcp_port3 }}"
+    - "{{ tcp_port4 }}"
     - "{{ udp_port3 }}"
     - "{{ udp_port1 }}"
     - "{{ udp_port2 }}"

+ 1 - 0
roles/slurm_manager/vars/main.yml

@@ -56,6 +56,7 @@ rpm_loop: "/rpmbuild/RPMS/x86_64/*.rpm"
 tcp_port1: "6817/tcp"
 tcp_port2: "6818/tcp"
 tcp_port3: "6819/tcp"
+tcp_port4: "60001-63000/tcp"
 udp_port1: "6817/udp"
 udp_port2: "6818/udp"
 udp_port3: "6819/udp"

+ 0 - 10
roles/slurm_start_services/tasks/main.yml

@@ -25,16 +25,6 @@
     dest: "{{ slurm_confpth }}"
     mode: "{{ slurm_mode }}"
 
-- name: Add compute hosts info in hosts file
-  lineinfile:
-    dest: "{{ hosts_dest }}"
-    line: "{{ hostvars[item].compute_host }} {{ hostvars[item].compute_ip }}"
-    state: present
-    create: yes
-    mode: "{{ common_mode }}"
-  with_items:
-    - "{{ groups['compute'] }}"
-
 - name: Enable slurmdbd on manager
   systemd:
     name: slurmdbd