
Issue #197: Fix for firewall rich rule for slurm

Signed-off-by: VishnupriyaKrish <Vishnupriya_Krishnar@Dellteam.com>
Lucas A. Wilson · 4 years ago · commit dbe7ba166b
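
The slurm fix named in the commit message lives in roles/slurm_manager/tasks/main.yml: the firewall rich rule is now applied to the public zone rather than internal. For quick reference, the task ends up roughly as sketched below (reconstructed from the hunk further down; family and network_address are role variables set elsewhere, and the task's own when/tags lines are not shown in the diff):

- name: Firewall rule slurm - allow all incoming traffic on internal network
  firewalld:
    zone: public
    rich_rule: 'rule family="{{ family }}" source address="{{ network_address }}" accept'
    permanent: true
    state: enabled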

+ 4 - 4
omnia.yml

@@ -22,7 +22,7 @@
 
 - name: Gather facts from all the nodes
   hosts: all
-    
+
 - name: Apply common installation and config
   hosts: manager, compute
   gather_facts: false
@@ -56,7 +56,7 @@
   gather_facts: false
   roles:
     - k8s_nfs_server_setup
-  tags: 
+  tags:
     - kubernetes
     - nfs
 
@@ -65,7 +65,7 @@
   gather_facts: false
   roles:
     - k8s_nfs_client_setup
-  tags: 
+  tags:
     - kubernetes
     - nfs
 
@@ -143,4 +143,4 @@
       set_fact:
         ssh_to: "{{ groups['manager'] }}"
   roles:
-    - cluster_preperation
+    - cluster_preperation

+ 1 - 13
platforms/roles/kubeflow/tasks/main.yml

@@ -114,20 +114,8 @@
     regexp: 'NodePort'
     replace: 'LoadBalancer'
 
-- name: Remove cert-manager application block
-  replace:
-    path: "{{ kubeflow_config_file }}"
-    regexp: "{{ cert_manager_block }}"
-    replace: "\n"
-
-- name: Remove seldon-core-operator application block
-  replace:
-    path: "{{ kubeflow_config_file }}"
-    regexp: "{{ seldon_core_operator_block }}"
-    replace: "\n"
-
 - name: Apply kubeflow configuration
   command:
     cmd: "/usr/bin/kfctl apply -V -f '{{ kubeflow_config_file }}'"
     chdir: "{{ omnia_kubeflow_dir_path }}"
-  changed_when: true
+  changed_when: true

+ 0 - 22
platforms/roles/kubeflow/vars/main.yml

@@ -32,25 +32,3 @@ kfserving_gateway_yaml_file_path: "{{ omnia_kubeflow_dir_path }}/kustomize/kfser
 argo_yaml_file_path: "{{ omnia_kubeflow_dir_path }}/kustomize/argo/base/service.yaml"
 
 kubeflow_config_file: "{{ omnia_kubeflow_dir_path }}/kfctl_k8s_istio.v1.0.2.yaml"
-
-cert_manager_block: >
-    - kustomizeConfig:
-          overlays:
-          - self-signed
-          - application
-          parameters:
-          - name: namespace
-            value: cert-manager
-          repoRef:
-            name: manifests
-            path: cert-manager/cert-manager
-        name: cert-manager
-
-seldon_core_operator_block: >
-    - kustomizeConfig:
-          overlays:
-          - application
-          repoRef:
-            name: manifests
-            path: seldon/seldon-core-operator
-        name: seldon-core-operator

+ 2 - 2
roles/common/tasks/nvidia.yml

@@ -26,7 +26,7 @@
     enabled: yes
   tags: install
 
-- name: Add nvidia-container-runtime Repo 
+- name: Add nvidia-container-runtime Repo
   yum_repository:
     name: nvidia-container-runtime
     description:  nvidia-container-runtime
@@ -39,7 +39,7 @@
     enabled: yes
   tags: install
 
-- name: Add nvidia-docker Repo 
+- name: Add nvidia-docker Repo
   yum_repository:
     name: nvidia-docker
     description:  nvidia-docker

+ 1 - 0
roles/common/vars/main.yml

@@ -24,6 +24,7 @@ common_packages:
   - chrony
   - pciutils
   - docker-ce
+  - openssl
 
 custom_fact_dir: /etc/ansible/facts.d
 

+ 2 - 2
roles/k8s_common/tasks/main.yml

@@ -21,8 +21,8 @@
     enabled: yes
     gpgcheck: no
     repo_gpgcheck: no
-    gpgkey: 
-      - https://packages.cloud.google.com/yum/doc/yum-key.gpg 
+    gpgkey:
+      - https://packages.cloud.google.com/yum/doc/yum-key.gpg
       - https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
   tags: install
 

+ 4 - 4
roles/k8s_firewalld/tasks/main.yml

@@ -50,7 +50,7 @@
     permanent: yes
     state: enabled
   with_items: "{{ flannel_udp_ports }}"
-  when: k8s_cni == "flannel"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "flannel"
   tags: firewalld
 
 - name: Open calico UDP ports on the firewall
@@ -59,7 +59,7 @@
     permanent: yes
     state: enabled
   with_items: "{{ calico_udp_ports }}"
-  when: k8s_cni == "calico"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: firewalld
 
 - name: Open calico TCP ports on the firewall
@@ -68,7 +68,7 @@
     permanent: yes
     state: enabled
   with_items: "{{ calico_tcp_ports }}"
-  when: k8s_cni == "calico"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: firewalld
 
 - name: Reload firewalld
@@ -81,4 +81,4 @@
     name: firewalld
     state: stopped
     enabled: no
-  tags: firewalld
+  tags: firewalld

+ 1 - 4
roles/k8s_firewalld/vars/main.yml

@@ -13,9 +13,6 @@
 #  limitations under the License.
 ---
 
-# Kubernetes SDN network
-k8s_cni: calico
-
 # Master nodes firewall ports
 k8s_master_ports:
   - 6443
@@ -39,4 +36,4 @@ calico_tcp_ports:
 # Flannel CNI firewall ports
 flannel_udp_ports:
   - 8285
-  - 8472
+  - 8472

+ 14 - 3
roles/k8s_nfs_client_setup/tasks/main.yml

@@ -19,6 +19,15 @@
     state: present
   tags: nfs_client
 
+- name: Check mounted share
+  shell: mount | grep nfs
+  changed_when: false
+  args:
+    warn: false
+  register: mounted_share
+  ignore_errors: True
+  tags: nfs_client
+
 - name: Creating directory to mount NFS Share
   file:
     path: "{{ nfs_mnt_dir }}"
@@ -27,14 +36,16 @@
   tags: nfs_client
 
 - name: Mounting NFS Share
-  command: "mount {{ groups['manager'] }}:{{ nfs_mnt_dir }} {{ nfs_mnt_dir }}"
+  command: "mount {{ groups['manager'][0] }}:{{ nfs_mnt_dir }} {{ nfs_mnt_dir }}"
   changed_when: true
   args:
     warn: false
+  when: groups['manager'][0] not in mounted_share.stdout
   tags: nfs_client
 
 - name: Configuring Automount NFS Shares on reboot
   lineinfile:
     path: "{{ fstab_file_path }}"
-    line: "{{ groups['manager'] }}:{{ nfs_mnt_dir }}     {{ nfs_mnt_dir }}  nfs     nosuid,rw,sync,hard,intr 0 0"
-  tags: nfs_client
+    line: "{{ groups['manager'][0] }}:{{ nfs_mnt_dir }}     {{ nfs_mnt_dir }}  nfs     nosuid,rw,sync,hard,intr 0 0"
+  when: groups['manager'][0] not in mounted_share.stdout
+  tags: nfs_client

+ 22 - 5
roles/k8s_start_manager/tasks/main.yml

@@ -22,9 +22,17 @@
   setup:
     filter: ansible_default_ipv4.address
 
+- name: Check K8s nodes status
+  command: kubectl get nodes
+  changed_when: false
+  ignore_errors: True
+  register: k8s_nodes
+  tags: init
+
 - name: Initialize kubeadm
   command: "/bin/kubeadm init --pod-network-cidr='{{ pod_network_cidr_ip }}' --apiserver-advertise-address='{{ ansible_default_ipv4.address }}'"
   changed_when: true
+  when: "'master' not in k8s_nodes.stdout"
   register: init_output
   tags: init
 
@@ -74,6 +82,7 @@
     token:  "{{ K8S_TOKEN.stdout }}"
     hash:   "{{ K8S_MANAGER_CA_HASH.stdout }}"
     ip:     "{{ ansible_default_ipv4.address }}"
+    k8s_nodes:  "{{ k8s_nodes.stdout }}"
   tags: init
 
 - name: Print k8s token
@@ -96,12 +105,12 @@
 
 - name: Setup Calico SDN network
   command: "kubectl apply -f '{{ calico_yml_url }}'"
-  when: k8s_cni == "calico"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: init
 
 - name: Setup Flannel SDN network
   command: "kubectl apply -f '{{ flannel_yml_url }}'"
-  when: k8s_cni == "flannel"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "flannel"
   tags: init
 
 - name: Create yaml repo for setup
@@ -120,9 +129,16 @@
     mode: "{{ k8s_service_account_file_mode }}"
   tags: init
 
+- name: Check K8s service accounts status
+  command: "kubectl get serviceaccounts"
+  changed_when: false
+  register: k8s_service_accounts
+  tags: init
+
 - name: Create service account (K8s dashboard)
   command: "kubectl create -f '{{ k8s_service_account_file_dest }}'"
   changed_when: true
+  when: "'default' not in k8s_service_accounts.stdout"
   tags: init
 
 - name: Create clusterRoleBinding (K8s dashboard) files
@@ -137,6 +153,7 @@
 - name: Create clusterRoleBinding (K8s dashboard)
   command: "kubectl create -f '{{ k8s_clusterRoleBinding_file_dest }}'"
   changed_when: true
+  ignore_errors: True
   tags: init
 
 - name: Dump bearer token for K8s dashboard login
@@ -146,7 +163,7 @@
   changed_when: true
   tags: init
 
-- name: Edge / Workstation Install allows pods to schedule on manager
+- name: Edge / Workstation Install allows pods to schedule on manager
   command: kubectl taint nodes --all node-role.kubernetes.io/master-
-  when: single_node
-  tags: init
+  when: groups['manager'][0] == groups['compute'][0] and groups['compute']|length == 1
+  tags: init

+ 1 - 5
roles/k8s_start_manager/vars/main.yml

@@ -13,10 +13,6 @@
 #  limitations under the License.
 ---
 
-single_node: false
-
-k8s_cni: calico
-
 pod_network_cidr_ip: 10.244.0.0/16
 
 k8s_root_directory: /root/.kube
@@ -47,4 +43,4 @@ k8s_clusterRoleBinding_file_mode: 0655
 
 calico_yml_url: https://docs.projectcalico.org/manifests/calico.yaml
 
-flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
+flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml

+ 17 - 3
roles/k8s_start_services/tasks/main.yml

@@ -19,9 +19,16 @@
   ignore_errors: True
   tags: init
 
+- name: Get K8s pods
+  command: kubectl get pods --all-namespaces
+  changed_when: false
+  register: k8s_pods
+  tags: init
+
 - name: Deploy MetalLB
   command: "kubectl apply -f '{{ metallb_yaml_url }}'"
   changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
   tags: init
 
 - name: Create MetalLB Setup Config Files
@@ -45,17 +52,19 @@
 - name: Deploy MetalLB
   command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
   changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
   tags: init
 
 - name: Create default setup for MetalLB
   command: "kubectl apply -f '{{ metallb_config_file_dest }}'"
   changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
   tags: init
 
 - name: Start k8s dashboard
   command: "kubectl create -f '{{ k8s_dashboard_yaml_url }}'"
   changed_when: true
-  register: result
+  when: "'kubernetes-dashboard' not in k8s_pods.stdout"
   tags: init
 
 - name: Helm - add stable repo
@@ -81,7 +90,7 @@
 - name: Start NFS Client Provisioner
   command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server }}' --set nfs.path='{{ nfs_path }}' --generate-name"
   changed_when: true
-  register: result
+  when: "'nfs-client-provisioner' not in k8s_pods.stdout"
   tags: init
 
 - name: Set NFS-Client Provisioner as DEFAULT StorageClass
@@ -97,25 +106,30 @@
     --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer \
     --generate-name
   changed_when: true
+  when: "'prometheus' not in k8s_pods.stdout"
   tags: init
 
 - name: Install MPI Operator
   command: "kubectl create -f '{{ mpi_operator_yaml_url }}'"
   changed_when: true
+  when: "'mpi-operator' not in k8s_pods.stdout"
   tags: init
 
 - name: Install nvidia-device-plugin
   command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
   changed_when: true
+  when: "'nvidia-device-plugin' not in k8s_pods.stdout"
   tags: init
 
 - name: Install GPU Feature Discovery
   command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
   changed_when: true
+  when: "'node-feature-discovery' not in k8s_pods.stdout"
   tags: init
 
 - name: Deploy Xilinx Device plugin
   command: "kubectl create -f '{{ fpga_device_plugin_yaml_url }}'"
   changed_when: true
   register: fpga_enable
-  tags: init
+  when: "'fpga-device-plugin' not in k8s_pods.stdout"
+  tags: init

+ 8 - 2
roles/k8s_start_workers/tasks/main.yml

@@ -18,10 +18,16 @@
   changed_when: true
   tags: init
 
+- name: Get hostname
+  command: hostname
+  changed_when: true
+  register: node_hostname
+  tags: init
+
 - name: Execute kubeadm join command
   shell: >
     kubeadm join --token={{ hostvars['K8S_TOKEN_HOLDER']['token'] }}
     --discovery-token-ca-cert-hash sha256:{{ hostvars['K8S_TOKEN_HOLDER']['hash'] }}
     {{ hostvars['K8S_TOKEN_HOLDER']['ip'] }}:{{ apiserver_bind_port }}
-  when: not single_node
-  tags: init
+  when: groups['manager'][0] != groups['compute'][0] and groups['compute']|length >= 1 and node_hostname.stdout not in hostvars['K8S_TOKEN_HOLDER']['k8s_nodes']
+  tags: init

+ 1 - 3
roles/k8s_start_workers/vars/main.yml

@@ -13,6 +13,4 @@
 #  limitations under the License.
 ---
 
-single_node: false
-
-apiserver_bind_port: 6443
+apiserver_bind_port: 6443

+ 11 - 2
roles/slurm_common/tasks/main.yml

@@ -14,7 +14,7 @@
 ---
 
 - name: Get hostname
-  command: hostname -s
+  command: hostname
   register: host_name
   changed_when: true
 
@@ -29,7 +29,7 @@
 - name: Add host name in hosts file
   lineinfile:
     dest: "{{ hosts_dest }}"
-    line: "{{ hostvars[inventory_hostname]['ansible_default_ipv4']['address'] }} {{ host_name.stdout }}"
+    line: "{{ inventory_hostname }} {{ host_name.stdout }}"
     state: present
     create: yes
     mode: "{{ common_mode }}"
@@ -155,6 +155,15 @@
     mode: "{{ gen_mode }}"
     recurse: yes
 
+- name: Give slurm user permission to spool directory
+  file:
+    path: "{{ spool_dir }}"
+    owner: slurm
+    group: slurm
+    state: directory
+    mode: "{{ common_mode }}"
+    recurse: yes
+
 - name: Create slurm pid directory
   file:
     path: "{{ slurm_pidpth }}"

+ 1 - 2
roles/slurm_common/vars/main.yml

@@ -13,8 +13,6 @@
 #  limitations under the License.
 ---
 
-epel_url: https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
-
 common_packages:
    - munge
    - munge-libs
@@ -41,6 +39,7 @@ slurm_uid: "6001"
 slurm_logpth: "/var/log/slurm/"
 slurm_pidpth: "/var/run/slurm/"
 gen_mode: "0755"
+spool_dir: "/var/spool/"
 spool_pth: "/var/spool/slurm/"
 slurmctld_pid: "/var/run/slurmctld.pid"
 slurmd_pid: "/var/run/slurmd.pid"

+ 14 - 10
roles/slurm_manager/tasks/main.yml

@@ -29,7 +29,7 @@
     mode: "{{ tmp_mode }}"
     state: touch
 
-- name: Create slurmctld log file on master
+- name: Create slurmctld log file on manager
   file:
     path: "{{ slurm_logpth }}"
     owner: slurm
@@ -38,14 +38,14 @@
   with_items:
     - slurmctld.log
 
-- name: Create log files on master
+- name: Create log files on manager
   file:
     path: "{{ slurm_logpth }}"
     owner: slurm
     mode: "{{ tmp_mode }}"
     state: touch
   with_items:
-    - "{{ log_files_master }}"
+    - "{{ log_files_manager }}"
 
 - name: Install packages for slurm
   package:
@@ -86,7 +86,7 @@
     warn: no
 
 - name: Verify package md5
-  command: rpm -qa
+  shell: rpm -qa | grep slurm
   ignore_errors: true
   register: verify_result
   changed_when: no
@@ -100,9 +100,10 @@
     chdir: "{{ rpm_path }}"
     warn: no
   changed_when: true
+  when: verify_result.rc != 0
 
 - name: Get the hostname
-  command: hostname -s
+  command: hostname
   register: machine_name
   changed_when: true
 
@@ -147,13 +148,13 @@
   when: "'manager' in group_names"
   tags: firewalld
 
-- name: Get network address/subnet mask through ipaddr
+- name: Get network address/subnet mask
   set_fact:
     network_address: "{{ (ansible_default_ipv4.network + '/' + ansible_default_ipv4.netmask) | ipaddr('network/prefix') }}"
 
 - name: Firewall rule slurm - allow all incoming traffic on internal network
   firewalld:
-    zone: internal
+    zone: public
     rich_rule: 'rule family="{{ family }}" source address="{{ network_address }}" accept'
     permanent: true
     state: enabled
@@ -172,7 +173,10 @@
   tags: install
 
 - name: Grant permissions for slurm db
-  command: mysql -u root -e "GRANT ALL ON slurm_acct_db.* TO '{{ db_user }}'@'{{ db_host }}' identified by '{{ db_password[0] }}'with grant option;"
+  command: >-
+    mysql -u root -e "GRANT ALL ON slurm_acct_db.* TO '{{ db_user }}'@'{{
+    db_host }}' identified by '{{ hostvars['127.0.0.1']['db_password'] }}' with
+    grant option;"
   tags: install
   changed_when: true
 
@@ -206,7 +210,7 @@
   lineinfile:
     path: "{{ slurmdbd_path }}"
     regexp: "StoragePass="
-    line: "StoragePass={{ db_password[0] }}"
+    line: "StoragePass={{ hostvars['127.0.0.1']['db_password'] }}"
 
 - name: Add storage user
   lineinfile:
@@ -230,4 +234,4 @@
   fetch:
     src: "{{ slurm_confpth }}"
     dest: "{{ buffer_path }}"
-    flat: true
+    flat: true

+ 1 - 1
roles/slurm_manager/vars/main.yml

@@ -38,7 +38,7 @@ dev_tools:
    - ncurses-devel
    - gtk2-devel
 
-log_files_master:
+log_files_manager:
    - slurm_jobacct.log
    - slurm_jobcomp.log
 

+ 6 - 8
roles/slurm_start_services/tasks/main.yml

@@ -32,7 +32,7 @@
   tags: install
 
 - name: Start slurmctld on manager
-  systemd:
+  service:
     name: slurmctld
     state: started
   tags: install
@@ -44,24 +44,22 @@
 
 - name: Create slurm cluster
   command: sacctmgr -i add cluster {{ cluster_name }}
-  when: slurm_clusterlist.stdout.find(cluster_name) == 1
+  when: not slurm_clusterlist.stdout
 
 - name: Show account
-  command: sacctmgr show account
+  command: sacctmgr show account -s
   register: account_added
   changed_when: false
 
 - name: Create default slurm group
   command: sacctmgr -i add account defaultgroup Cluster={{ cluster_name }} Description="Default Account" Organization="Default Org"
-  when: account_added.stdout.find(cluster_name) == 1
-  tags: install
+  when: account_added.rc != 0
 
 - name: Check if user exists
-  command: sacctmgr show user
+  command: sacctmgr show user -s
   register: user_added
   changed_when: false
 
 - name: Add root to the default account
   command: sacctmgr -i add user root DefaultAccount=defaultgroup
-  when: account_added.stdout.find(cluster_name) == 1
-  tags: install
+  when: user_added.rc != 0

+ 3 - 3
roles/slurm_workers/tasks/main.yml

@@ -92,7 +92,7 @@
     warn: no
 
 - name: Verify package md5
-  command: rpm -qa
+  shell: rpm -qa | grep slurm
   ignore_errors: true
   register: verify_result
   changed_when: no
@@ -106,9 +106,10 @@
     chdir: "{{ rpm_path }}"
     warn: no
   changed_when: true
+  when: verify_result.rc != 0
 
 - name: Get the hostname
-  command: hostname -s
+  command: hostname
   register: machine_name
   changed_when: true
 
@@ -119,7 +120,6 @@
     line: "NodeName={{ machine_name.stdout }} Sockets={{ hostvars[inventory_hostname]['ansible_facts']['processor_count'] }}
       CoresPerSocket={{ hostvars[inventory_hostname]['ansible_facts']['processor_cores'] }}"
 
-
 - name: Save slurm conf in buffer
   fetch:
     src: "{{ slurm_confpth }}"