
Issue #487: Fix for calico pod in ErrImagePull

Signed-off-by: blesson-james <blesson_james@Dellteam.com>
blesson-james 3 years ago
commit f15a614dbc
30 changed files with 4581 additions and 644 deletions
  1. control_plane/roles/control_plane_repo/files/C4140_inv.xml (+1 -1)
  2. control_plane/roles/control_plane_repo/files/C6420_inv.xml (+1 -1)
  3. control_plane/roles/control_plane_repo/files/C6520_inv.xml (+1 -1)
  4. control_plane/roles/control_plane_repo/files/R240_inv.xml (+1 -1)
  5. control_plane/roles/control_plane_repo/files/R340_inv.xml (+1 -1)
  6. control_plane/roles/control_plane_repo/files/R440_inv.xml (+1 -1)
  7. control_plane/roles/control_plane_repo/files/R540_inv.xml (+1 -1)
  8. control_plane/roles/control_plane_repo/files/R640_inv.xml (+1 -1)
  9. control_plane/roles/control_plane_repo/files/R650_inv.xml (+1 -1)
  10. control_plane/roles/control_plane_repo/files/R740_inv.xml (+1 -1)
  11. control_plane/roles/control_plane_repo/files/R740xd2_inv.xml (+1 -1)
  12. control_plane/roles/control_plane_repo/files/R740xd_inv.xml (+1 -1)
  13. control_plane/roles/control_plane_repo/files/R750_inv.xml (+1 -1)
  14. control_plane/roles/control_plane_repo/files/R750xa_inv.xml (+1 -1)
  15. control_plane/roles/control_plane_repo/files/R840_inv.xml (+1 -1)
  16. control_plane/roles/control_plane_repo/files/R940_inv.xml (+1 -1)
  17. control_plane/roles/control_plane_repo/files/R940xa_inv.xml (+1 -1)
  18. control_plane/roles/control_plane_repo/tasks/download_fmw_updates.yml (+3 -2)
  19. control_plane/roles/control_plane_repo/tasks/install_dsu.yml (+11 -3)
  20. control_plane/roles/control_plane_repo/tasks/validate_idrac_vars.yml (+2 -2)
  21. control_plane/roles/control_plane_repo/vars/main.yml (+3 -1)
  22. omnia.yml (+1 -1)
  23. roles/k8s_start_manager/files/kube-calico.yaml (+4090 -0)
  24. roles/k8s_start_manager/files/kube-flannel.yaml (+63 -376)
  25. roles/k8s_start_manager/tasks/main.yml (+50 -8)
  26. roles/k8s_start_manager/vars/main.yml (+6 -2)
  27. roles/k8s_start_services/tasks/check_k8s_pods.yml (+102 -0)
  28. roles/k8s_start_services/tasks/deploy_k8s_services.yml (+220 -0)
  29. roles/k8s_start_services/tasks/main.yml (+8 -232)
  30. roles/k8s_start_services/vars/main.yml (+5 -0)

File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/C4140_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/C6420_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/C6520_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R240_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R340_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R440_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R540_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R640_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R650_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R740_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R740xd2_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R740xd_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R750_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R750xa_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R840_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R940_inv.xml


File diff suppressed because it is too large
+ 1 - 1
control_plane/roles/control_plane_repo/files/R940xa_inv.xml


+ 3 - 2
control_plane/roles/control_plane_repo/tasks/download_fmw_updates.yml

@@ -14,7 +14,8 @@
 ---
 
 - name: Downloading Firwmare Updates (This process may take few hours depending upon the poweredge_model list)
-  command: 'dsu --destination-type=REP --input-inventory-file="{{ role_path }}/files/{{ item }}_inv.xml" \
+  command: 'dsu --destination-type=REP --source-type=REPOSITORY --input-inventory-file="{{ role_path }}/files/{{ item }}_inv.xml" \
     --destination-location="{{ nfs_share_offline_repo }}" --non-interactive'
   with_items: "{{ poweredge_model.split(',') | map('trim') }}"
-  changed_when: true
+  changed_when: true
+  failed_when: false

+ 11 - 3
control_plane/roles/control_plane_repo/tasks/install_dsu.yml

@@ -29,12 +29,20 @@
 
 - name: Execute bootstrap.cgi
   shell: >
+    set -o pipefail && \
     echo "y" | bash {{ dsu_folder_dest }}/bootstrap.cgi
   changed_when: true
   register: bootstrap_execution_result
   failed_when: "'FAILED' in bootstrap_execution_result.stderr"
 
+- name: Download dell-omnia-system-update.rpm
+  get_url:
+    url: "{{ dsu_rpm_url }}"
+    dest: "{{ dsu_folder_dest }}"
+  register: dsu_rpm_result
+  until: dsu_rpm_result is not failed
+  retries: 20
+
 - name: Install DSU
-  package:
-    name: dell-system-update
-    state: present
+  command: dnf install "{{ dsu_folder_dest }}/{{ dsu_rpm_file }}" -y
+  changed_when: true

+ 2 - 2
control_plane/roles/control_plane_repo/tasks/validate_idrac_vars.yml

@@ -26,7 +26,7 @@
 
 - name: Read poweredge_model file
   command: cat {{ role_path }}/files/poweredge_models.txt
-  ignore_errors: yes
+  failed_when: false
   register: poweredge_models_file_output
   changed_when: false
 
@@ -38,4 +38,4 @@
     success_msg: "{{ poweredge_model_success_msg }}"
     fail_msg: "{{ poweredge_model_fail_msg }}"
   when: firmware_update_required
-  with_items: "{{ poweredge_model.split(',') | map('trim') }}"
+  with_items: "{{ poweredge_model.split(',') | map('trim') }}"

+ 3 - 1
control_plane/roles/control_plane_repo/vars/main.yml

@@ -25,4 +25,6 @@ poweredge_model_fail_msg: "Failed. poweredge_model is incorrect or unsupported.
 # Usage: install_dsu.yml
 dsu_folder_dest: /root/dsu
 dsu_folder_dest_mode: 0644
-bootstrap_repo_url: https://linux.dell.com/repo/hardware/dsu/bootstrap.cgi
+bootstrap_repo_url: https://linux.dell.com/repo/hardware/dsu/bootstrap.cgi
+dsu_rpm_url: https://linux.dell.com/repo/hardware/omnia/os_independent/x86_64/dell-omnia-system-update-1.9.2.1-21.08.00.x86_64.rpm
+dsu_rpm_file: dell-omnia-system-update-1.9.2.1-21.08.00.x86_64.rpm

+ 1 - 1
omnia.yml

@@ -105,7 +105,7 @@
   tags: kubernetes
 
 - name: Start K8s worker servers on manager nodes
-  hosts: manager
+  hosts: manager, compute
   gather_facts: false
   roles:
     - k8s_start_services

File diff suppressed because it is too large
+ 4090 - 0
roles/k8s_start_manager/files/kube-calico.yaml


+ 63 - 376
roles/k8s_start_manager/files/kube-flannel.yaml

@@ -1,5 +1,5 @@
 ---
-apiVersion: extensions/v1beta1
+apiVersion: policy/v1beta1
 kind: PodSecurityPolicy
 metadata:
   name: psp.flannel.unprivileged
@@ -11,14 +11,14 @@ metadata:
 spec:
   privileged: false
   volumes:
-    - configMap
-    - secret
-    - emptyDir
-    - hostPath
+  - configMap
+  - secret
+  - emptyDir
+  - hostPath
   allowedHostPaths:
-    - pathPrefix: "/etc/cni/net.d"
-    - pathPrefix: "/etc/kube-flannel"
-    - pathPrefix: "/run/flannel"
+  - pathPrefix: "/etc/cni/net.d"
+  - pathPrefix: "/etc/kube-flannel"
+  - pathPrefix: "/run/flannel"
   readOnlyRootFilesystem: false
   # Users and groups
   runAsUser:
@@ -31,7 +31,7 @@ spec:
   allowPrivilegeEscalation: false
   defaultAllowPrivilegeEscalation: false
   # Capabilities
-  allowedCapabilities: ['NET_ADMIN']
+  allowedCapabilities: ['NET_ADMIN', 'NET_RAW']
   defaultAddCapabilities: []
   requiredDropCapabilities: []
   # Host namespaces
@@ -43,40 +43,40 @@ spec:
     max: 65535
   # SELinux
   seLinux:
-    # SELinux is unsed in CaaSP
+    # SELinux is unused in CaaSP
     rule: 'RunAsAny'
 ---
 kind: ClusterRole
-apiVersion: rbac.authorization.k8s.io/v1beta1
+apiVersion: rbac.authorization.k8s.io/v1
 metadata:
   name: flannel
 rules:
-  - apiGroups: ['extensions']
-    resources: ['podsecuritypolicies']
-    verbs: ['use']
-    resourceNames: ['psp.flannel.unprivileged']
-  - apiGroups:
-      - ""
-    resources:
-      - pods
-    verbs:
-      - get
-  - apiGroups:
-      - ""
-    resources:
-      - nodes
-    verbs:
-      - list
-      - watch
-  - apiGroups:
-      - ""
-    resources:
-      - nodes/status
-    verbs:
-      - patch
+- apiGroups: ['extensions']
+  resources: ['podsecuritypolicies']
+  verbs: ['use']
+  resourceNames: ['psp.flannel.unprivileged']
+- apiGroups:
+  - ""
+  resources:
+  - pods
+  verbs:
+  - get
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  verbs:
+  - list
+  - watch
+- apiGroups:
+  - ""
+  resources:
+  - nodes/status
+  verbs:
+  - patch
 ---
 kind: ClusterRoleBinding
-apiVersion: rbac.authorization.k8s.io/v1beta1
+apiVersion: rbac.authorization.k8s.io/v1
 metadata:
   name: flannel
 roleRef:
@@ -106,6 +106,7 @@ data:
   cni-conf.json: |
     {
       "name": "cbr0",
+      "cniVersion": "0.3.1",
       "plugins": [
         {
           "type": "flannel",
@@ -130,31 +131,42 @@ data:
       }
     }
 ---
-apiVersion: extensions/v1beta1
+apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-  name: kube-flannel-ds-amd64
+  name: kube-flannel-ds
   namespace: kube-system
   labels:
     tier: node
     app: flannel
 spec:
+  selector:
+    matchLabels:
+      app: flannel
   template:
     metadata:
       labels:
         tier: node
         app: flannel
     spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: kubernetes.io/os
+                operator: In
+                values:
+                - linux
       hostNetwork: true
-      nodeSelector:
-        beta.kubernetes.io/arch: amd64
+      priorityClassName: system-node-critical
       tolerations:
       - operator: Exists
         effect: NoSchedule
       serviceAccountName: flannel
       initContainers:
       - name: install-cni
-        image: quay.io/coreos/flannel:v0.11.0-amd64
+        image: quay.io/coreos/flannel:v0.14.0
         command:
         - cp
         args:
@@ -168,13 +180,12 @@ spec:
           mountPath: /etc/kube-flannel/
       containers:
       - name: kube-flannel
-        image: quay.io/coreos/flannel:v0.11.0-amd64
+        image: quay.io/coreos/flannel:v0.14.0
         command:
         - /opt/bin/flanneld
         args:
         - --ip-masq
         - --kube-subnet-mgr
-        - --iface=ib0
         resources:
           requests:
             cpu: "100m"
@@ -185,7 +196,7 @@ spec:
         securityContext:
           privileged: false
           capabilities:
-             add: ["NET_ADMIN"]
+            add: ["NET_ADMIN", "NET_RAW"]
         env:
         - name: POD_NAME
           valueFrom:
@@ -201,336 +212,12 @@ spec:
         - name: flannel-cfg
           mountPath: /etc/kube-flannel/
       volumes:
-        - name: run
-          hostPath:
-            path: /run/flannel
-        - name: cni
-          hostPath:
-            path: /etc/cni/net.d
-        - name: flannel-cfg
-          configMap:
-            name: kube-flannel-cfg
----
-apiVersion: extensions/v1beta1
-kind: DaemonSet
-metadata:
-  name: kube-flannel-ds-arm64
-  namespace: kube-system
-  labels:
-    tier: node
-    app: flannel
-spec:
-  template:
-    metadata:
-      labels:
-        tier: node
-        app: flannel
-    spec:
-      hostNetwork: true
-      nodeSelector:
-        beta.kubernetes.io/arch: arm64
-      tolerations:
-      - operator: Exists
-        effect: NoSchedule
-      serviceAccountName: flannel
-      initContainers:
-      - name: install-cni
-        image: quay.io/coreos/flannel:v0.11.0-arm64
-        command:
-        - cp
-        args:
-        - -f
-        - /etc/kube-flannel/cni-conf.json
-        - /etc/cni/net.d/10-flannel.conflist
-        volumeMounts:
-        - name: cni
-          mountPath: /etc/cni/net.d
-        - name: flannel-cfg
-          mountPath: /etc/kube-flannel/
-      containers:
-      - name: kube-flannel
-        image: quay.io/coreos/flannel:v0.11.0-arm64
-        command:
-        - /opt/bin/flanneld
-        args:
-        - --ip-masq
-        - --kube-subnet-mgr
-        - --iface=ib0
-        resources:
-          requests:
-            cpu: "100m"
-            memory: "50Mi"
-          limits:
-            cpu: "100m"
-            memory: "50Mi"
-        securityContext:
-          privileged: false
-          capabilities:
-             add: ["NET_ADMIN"]
-        env:
-        - name: POD_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.name
-        - name: POD_NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        volumeMounts:
-        - name: run
-          mountPath: /run/flannel
-        - name: flannel-cfg
-          mountPath: /etc/kube-flannel/
-      volumes:
-        - name: run
-          hostPath:
-            path: /run/flannel
-        - name: cni
-          hostPath:
-            path: /etc/cni/net.d
-        - name: flannel-cfg
-          configMap:
-            name: kube-flannel-cfg
----
-apiVersion: extensions/v1beta1
-kind: DaemonSet
-metadata:
-  name: kube-flannel-ds-arm
-  namespace: kube-system
-  labels:
-    tier: node
-    app: flannel
-spec:
-  template:
-    metadata:
-      labels:
-        tier: node
-        app: flannel
-    spec:
-      hostNetwork: true
-      nodeSelector:
-        beta.kubernetes.io/arch: arm
-      tolerations:
-      - operator: Exists
-        effect: NoSchedule
-      serviceAccountName: flannel
-      initContainers:
-      - name: install-cni
-        image: quay.io/coreos/flannel:v0.11.0-arm
-        command:
-        - cp
-        args:
-        - -f
-        - /etc/kube-flannel/cni-conf.json
-        - /etc/cni/net.d/10-flannel.conflist
-        volumeMounts:
-        - name: cni
-          mountPath: /etc/cni/net.d
-        - name: flannel-cfg
-          mountPath: /etc/kube-flannel/
-      containers:
-      - name: kube-flannel
-        image: quay.io/coreos/flannel:v0.11.0-arm
-        command:
-        - /opt/bin/flanneld
-        args:
-        - --ip-masq
-        - --kube-subnet-mgr
-        - --iface=ib0
-        resources:
-          requests:
-            cpu: "100m"
-            memory: "50Mi"
-          limits:
-            cpu: "100m"
-            memory: "50Mi"
-        securityContext:
-          privileged: false
-          capabilities:
-             add: ["NET_ADMIN"]
-        env:
-        - name: POD_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.name
-        - name: POD_NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        volumeMounts:
-        - name: run
-          mountPath: /run/flannel
-        - name: flannel-cfg
-          mountPath: /etc/kube-flannel/
-      volumes:
-        - name: run
-          hostPath:
-            path: /run/flannel
-        - name: cni
-          hostPath:
-            path: /etc/cni/net.d
-        - name: flannel-cfg
-          configMap:
-            name: kube-flannel-cfg
----
-apiVersion: extensions/v1beta1
-kind: DaemonSet
-metadata:
-  name: kube-flannel-ds-ppc64le
-  namespace: kube-system
-  labels:
-    tier: node
-    app: flannel
-spec:
-  template:
-    metadata:
-      labels:
-        tier: node
-        app: flannel
-    spec:
-      hostNetwork: true
-      nodeSelector:
-        beta.kubernetes.io/arch: ppc64le
-      tolerations:
-      - operator: Exists
-        effect: NoSchedule
-      serviceAccountName: flannel
-      initContainers:
-      - name: install-cni
-        image: quay.io/coreos/flannel:v0.11.0-ppc64le
-        command:
-        - cp
-        args:
-        - -f
-        - /etc/kube-flannel/cni-conf.json
-        - /etc/cni/net.d/10-flannel.conflist
-        volumeMounts:
-        - name: cni
-          mountPath: /etc/cni/net.d
-        - name: flannel-cfg
-          mountPath: /etc/kube-flannel/
-      containers:
-      - name: kube-flannel
-        image: quay.io/coreos/flannel:v0.11.0-ppc64le
-        command:
-        - /opt/bin/flanneld
-        args:
-        - --ip-masq
-        - --kube-subnet-mgr
-        - --iface=ib0
-        resources:
-          requests:
-            cpu: "100m"
-            memory: "50Mi"
-          limits:
-            cpu: "100m"
-            memory: "50Mi"
-        securityContext:
-          privileged: false
-          capabilities:
-             add: ["NET_ADMIN"]
-        env:
-        - name: POD_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.name
-        - name: POD_NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        volumeMounts:
-        - name: run
-          mountPath: /run/flannel
-        - name: flannel-cfg
-          mountPath: /etc/kube-flannel/
-      volumes:
-        - name: run
-          hostPath:
-            path: /run/flannel
-        - name: cni
-          hostPath:
-            path: /etc/cni/net.d
-        - name: flannel-cfg
-          configMap:
-            name: kube-flannel-cfg
----
-apiVersion: extensions/v1beta1
-kind: DaemonSet
-metadata:
-  name: kube-flannel-ds-s390x
-  namespace: kube-system
-  labels:
-    tier: node
-    app: flannel
-spec:
-  template:
-    metadata:
-      labels:
-        tier: node
-        app: flannel
-    spec:
-      hostNetwork: true
-      nodeSelector:
-        beta.kubernetes.io/arch: s390x
-      tolerations:
-      - operator: Exists
-        effect: NoSchedule
-      serviceAccountName: flannel
-      initContainers:
-      - name: install-cni
-        image: quay.io/coreos/flannel:v0.11.0-s390x
-        command:
-        - cp
-        args:
-        - -f
-        - /etc/kube-flannel/cni-conf.json
-        - /etc/cni/net.d/10-flannel.conflist
-        volumeMounts:
-        - name: cni
-          mountPath: /etc/cni/net.d
-        - name: flannel-cfg
-          mountPath: /etc/kube-flannel/
-      containers:
-      - name: kube-flannel
-        image: quay.io/coreos/flannel:v0.11.0-s390x
-        command:
-        - /opt/bin/flanneld
-        args:
-        - --ip-masq
-        - --kube-subnet-mgr
-        - --iface=ib0
-        resources:
-          requests:
-            cpu: "100m"
-            memory: "50Mi"
-          limits:
-            cpu: "100m"
-            memory: "50Mi"
-        securityContext:
-          privileged: false
-          capabilities:
-             add: ["NET_ADMIN"]
-        env:
-        - name: POD_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.name
-        - name: POD_NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.namespace
-        volumeMounts:
-        - name: run
-          mountPath: /run/flannel
-        - name: flannel-cfg
-          mountPath: /etc/kube-flannel/
-      volumes:
-        - name: run
-          hostPath:
-            path: /run/flannel
-        - name: cni
-          hostPath:
-            path: /etc/cni/net.d
-        - name: flannel-cfg
-          configMap:
-            name: kube-flannel-cfg
+      - name: run
+        hostPath:
+          path: /run/flannel
+      - name: cni
+        hostPath:
+          path: /etc/cni/net.d
+      - name: flannel-cfg
+        configMap:
+          name: kube-flannel-cfg

+ 50 - 8
roles/k8s_start_manager/tasks/main.yml

@@ -178,21 +178,63 @@
   retries: 10
   tags: install
 
+- name: Create yaml repo for setup
+  file:
+    path: "{{ yaml_repo_dir_path }}"
+    state: directory
+    mode: "{{ yaml_repo_dir_mode }}"
+  tags: init
+
+- name: Delete Calico yaml file if exists
+  file:
+    path: "{{ calico_yml_file_path }}"
+    state: absent
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
+  tags: init
+
+- name: Copy Calico yaml file
+  copy:
+    src: kube-calico.yaml
+    dest: "{{ calico_yml_file_path }}"
+    owner: root
+    group: root
+    mode: "{{ calico_yml_file_mode }}"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
+  tags: init
+
 - name: Setup Calico SDN network
-  command: "kubectl apply -f '{{ calico_yml_url }}'"
+  command: "kubectl apply -f '{{ calico_yml_file_path }}'"
   when: hostvars['127.0.0.1']['k8s_cni'] == "calico"
   tags: init
 
-- name: Setup Flannel SDN network
-  command: "kubectl apply -f '{{ flannel_yml_url }}'"
+- name: Delete Flannel yaml file if exists
+  file:
+    path: "{{ flannel_yml_file_path }}"
+    state: absent
   when: hostvars['127.0.0.1']['k8s_cni'] == "flannel"
   tags: init
 
-- name: Create yaml repo for setup
-  file:
-    path: "{{ yaml_repo_dir_path }}"
-    state: directory
-    mode: "{{ yaml_repo_dir_mode }}"
+- name: Copy Flannel yaml file
+  copy:
+    src: kube-flannel.yaml
+    dest: "{{ flannel_yml_file_path }}"
+    owner: root
+    group: root
+    mode: "{{ flannel_yml_file_mode }}"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "flannel"
+  tags: init
+
+- name: Replace flannel network cidr
+  replace:
+    path: "{{ flannel_yml_file_path }}"
+    regexp: "10.244.0.0/16"
+    replace: "{{ hostvars['127.0.0.1']['k8s_pod_network_cidr'] }}"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "flannel"
+  tags: init
+
+- name: Setup Flannel SDN network
+  command: "kubectl apply -f '{{ flannel_yml_file_path }}'"
+  when: hostvars['127.0.0.1']['k8s_cni'] == "flannel"
   tags: init
 
 - name: Create service account (K8s dashboard) files

+ 6 - 2
roles/k8s_start_manager/vars/main.yml

@@ -41,6 +41,10 @@ cluster_role_binding_file_dest: /root/k8s/create_clusterRoleBinding.yaml
 
 cluster_role_binding_file_mode: 0655
 
-calico_yml_url: https://docs.projectcalico.org/manifests/calico.yaml
+calico_yml_file_path: /root/k8s/kube-calico.yaml
 
-flannel_yml_url: https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
+calico_yml_file_mode: 0644
+
+flannel_yml_file_path: /root/k8s/kube-flannel.yaml
+
+flannel_yml_file_mode: 0644

+ 102 - 0
roles/k8s_start_services/tasks/check_k8s_pods.yml

@@ -0,0 +1,102 @@
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Wait for calico pod to come to ready state
+  command: "kubectl wait --for=condition=ready -n kube-system pod -l k8s-app=calico-node --timeout=10m"
+  changed_when: false
+  register: calico_pod_status
+  failed_when: false
+  when:
+    - "'manager' in group_names"
+    - hostvars['127.0.0.1']['k8s_cni'] == "calico"
+  tags: install
+
+- name: Wait for flannel pod to come to ready state
+  command: "kubectl wait --for=condition=ready -n kube-system pod -l app=flannel --timeout=10m"
+  changed_when: false
+  register: flannel_pod_status
+  failed_when: false
+  when:
+    - "'manager' in group_names"
+    - hostvars['127.0.0.1']['k8s_cni'] == "flannel"
+  tags: install
+
+- name: Wait for nfs-client-provisioner pod to come to ready state
+  command: "kubectl wait --for=condition=ready -n default pod -l app=nfs-client-provisioner --timeout=10m"
+  changed_when: false
+  register: nfs_pod_status
+  failed_when: false
+  when:
+    - "'manager' in group_names"
+    - calico_pod_status is not failed or flannel_pod_status is not failed
+  tags: install
+
+- name: Wait for volcano-scheduler pod to come to ready state
+  command: "kubectl wait --for=condition=ready -n volcano-system pod -l app=volcano-scheduler --timeout=5m"
+  changed_when: false
+  register: volcano_pod_status
+  failed_when: false
+  when:
+    - "'manager' in group_names"
+    - nfs_pod_status is not failed
+  tags: install
+
+- name: Get K8s pods
+  command: kubectl get pods --all-namespaces
+  changed_when: false
+  register: k8s_pods
+  when: "'manager' in group_names"
+  tags: install
+
+- name: Add k8s_pods_status to dummy host
+  add_host:
+    name: "check_k8s_pods"
+    k8s_pods_status: "{{ k8s_pods.stdout }}"
+  tags: install
+
+- name: Fail message
+  fail:
+    msg: "{{ docker_pull_limit_msg }}"
+  when:
+    - "'ImagePullBackOff' in hostvars['check_k8s_pods']['k8s_pods_status'] or 'ErrImagePull' in hostvars['check_k8s_pods']['k8s_pods_status']"
+    - not hostvars['127.0.0.1']['docker_username'] and not hostvars['127.0.0.1']['docker_password']
+
+- name: Docker login
+  command: docker login -u {{ hostvars['127.0.0.1']['docker_username'] }} -p {{ hostvars['127.0.0.1']['docker_password'] }}
+  changed_when: true
+  register: docker_login_output
+  failed_when: false
+  when:
+    - "'ImagePullBackOff' in hostvars['check_k8s_pods']['k8s_pods_status'] or 'ErrImagePull' in hostvars['check_k8s_pods']['k8s_pods_status']"
+    - hostvars['127.0.0.1']['docker_username'] or hostvars['127.0.0.1']['docker_password']
+    - "'compute' in group_names"
+  no_log: true
+
+- name: Docker login check
+  fail:
+    msg: "{{ docker_login_fail_msg }}"
+  when:
+    - docker_login_output is failed
+    - "'compute' in group_names"
+
+- name: Pull K8s services docker images
+  command: docker pull {{ item }}
+  with_items: "{{ k8s_docker_images }}"
+  when:
+    - "'ImagePullBackOff' in hostvars['check_k8s_pods']['k8s_pods_status'] or 'ErrImagePull' in hostvars['check_k8s_pods']['k8s_pods_status']"
+    - hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
+  register: docker_image_pull_result
+  until: docker_image_pull_result is not failed
+  retries: 5

+ 220 - 0
roles/k8s_start_services/tasks/deploy_k8s_services.yml

@@ -0,0 +1,220 @@
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+---
+
+- name: Include common variables
+  include_vars: ../../slurm_exporter/vars/main.yml
+
+- name: Include k8s_nfs_server_setup variables
+  include_vars: ../../k8s_nfs_server_setup/vars/main.yml
+
+- name: Include powervault_me4_nfs variables
+  include_vars: ../../powervault_me4_nfs/vars/main.yml
+
+- name: Wait for CoreDNS to restart
+  command: kubectl rollout status deployment/coredns -n kube-system  --timeout=5m
+  changed_when: false
+  failed_when: false
+  tags: init
+
+- name: Get K8s pods
+  command: kubectl get pods --all-namespaces
+  changed_when: false
+  register: k8s_pods
+  tags: init
+
+- name: Deploy MetalLB
+  command: "kubectl apply -f '{{ metallb_yaml_url }}'"
+  changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
+  tags: init
+
+- name: Create MetalLB Setup Config Files
+  copy:
+    src: metal-config.yaml
+    dest: "{{ metallb_config_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ metallb_config_file_mode }}"
+  tags: init
+
+- name: Create MetalLB Setup Deployment Files
+  copy:
+    src: metallb.yaml
+    dest: "{{ metallb_deployment_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ metallb_deployment_file_mode }}"
+  tags: init
+
+- name: Deploy MetalLB
+  command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
+  changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
+  tags: init
+
+- name: Create default setup for MetalLB
+  command: "kubectl apply -f '{{ metallb_config_file_dest }}'"
+  changed_when: true
+  when: "'metallb' not in k8s_pods.stdout"
+  tags: init
+
+- name: Start k8s dashboard
+  command: "kubectl create -f '{{ k8s_dashboard_yaml_url }}'"
+  changed_when: true
+  when: "'kubernetes-dashboard' not in k8s_pods.stdout"
+  tags: init
+
+- name: Copy k8s_dashboard_admin.yml file
+  copy:
+    src: k8s_dashboard_admin.yaml
+    dest: "{{ k8s_dashboard_admin_file_dest }}"
+    owner: root
+    group: root
+    mode: "{{ k8s_dashboard_admin_file_mode }}"
+
+- name: Create admin user for K8s dashboard
+  command: "kubectl apply -f {{ k8s_dashboard_admin_file_dest }}"
+  changed_when: true
+
+- name: Helm - add stable repo
+  command: "helm repo add stable '{{ helm_stable_repo_url }}'"
+  changed_when: true
+  tags: init
+
+- name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
+  command: "helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'"
+  changed_when: true
+  tags: init
+
+- name: Helm - add Nvidia GPU discovery (nvgfd) repo
+  command: "helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'"
+  changed_when: true
+  tags: init
+
+- name: Helm - update repo
+  command: helm repo update
+  changed_when: true
+  tags: init
+
+- name: Start NFS Client Provisioner
+  command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server }}' --set nfs.path='{{ nfs_path }}' --generate-name"
+  changed_when: true
+  when: "'nfs-client-provisioner' not in k8s_pods.stdout"
+  tags: init
+
+- name: Set NFS-Client Provisioner as DEFAULT StorageClass
+  shell: >
+    kubectl patch storageclasses.storage.k8s.io nfs-client \
+    -p '{ "metadata": { "annotations":{ "storageclass.kubernetes.io/is-default-class":"true" }}}'
+  changed_when: true
+  tags: init
+
+- name: Check if prometheus is installed on the host
+  stat:
+    path: "{{ prometheus_path_on_host }}"
+  register: prometheus_status
+  changed_when: False
+  ignore_errors: yes
+  tags: init
+
+- name: Delete prometheus installed on host if it exists
+  file:
+    path: "{{ prometheus_path_on_host }}"
+    state: absent
+  when: prometheus_status.stat.exists
+  tags: init
+
+- name: Copy the slurm exporter config file
+  copy:
+    src: "{{ slurm_exporter_config_file }}"
+    dest: "{{ slurm_exporter_config_file_path }}"
+    owner: root
+    group: root
+    mode: "{{ slurm_exporter_file_mode }}"
+  tags: init
+
+- name: Fetch the public IP of the host
+  shell: >
+    set -o pipefail && \
+      ip route get 8.8.8.8 | awk '{print $7}'
+  register: public_ip
+  changed_when: False
+  tags: init
+
+- name: Add the host IP to config file
+  replace:
+    path: "{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
+    regexp: "localhost:8080"
+    replace: "{{ public_ip.stdout }}:{{ slurm_exporter_port }}"
+  tags: init
+
+- name: Prometheus deployment
+  command: >
+    helm install stable/prometheus \
+    --set-file extraScrapeConfigs="{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}" \
+    --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer \
+    --generate-name
+  changed_when: true
+  when: "'prometheus' not in k8s_pods.stdout"
+  tags: init
+
+- name: Install MPI Operator
+  command: "kubectl create -f '{{ mpi_operator_yaml_url }}'"
+  changed_when: true
+  when: "'mpi-operator' not in k8s_pods.stdout"
+  tags: init
+
+- name: Install nvidia-device-plugin
+  command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
+  changed_when: true
+  when: "'nvidia-device-plugin' not in k8s_pods.stdout"
+  tags: init
+
+- name: Install GPU Feature Discovery
+  command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
+  changed_when: true
+  when: "'node-feature-discovery' not in k8s_pods.stdout"
+  tags: init
+
+- name: Deploy Xilinx Device plugin
+  command: "kubectl create -f '{{ fpga_device_plugin_yaml_url }}'"
+  changed_when: true
+  register: fpga_enable
+  when: "'fpga-device-plugin' not in k8s_pods.stdout"
+  tags: init
+
+- name: Deploy ROCm Device plugin
+  command: "kubectl create -f '{{ rocm_device_plugin_yaml_url }}'"
+  changed_when: true
+  register: amd_gpu_enable
+  when: "'amdgpu-device-plugin' not in k8s_pods.stdout"
+  tags: init
+
+- name: Deploy Volcano Scheduling
+  command: "kubectl apply -f '{{ volcano_scheduling_yaml_url }}'"
+  changed_when: true
+  when: "'volcano-system' not in k8s_pods.stdout"
+  tags: init
+
+- name: Install Spark Operator
+  command: "helm repo add spark-operator '{{ spark_operator_repo }}'"
+  changed_when: true
+  tags: init
+
+- name: Install Spark Operator Namespace
+  command: helm install my-release spark-operator/spark-operator --set image.tag={{ operator_image_tag }} --namespace spark-operator --create-namespace
+  changed_when: true
+  when: "'spark-operator' not in k8s_pods.stdout"
+  tags: init

+ 8 - 232
roles/k8s_start_services/tasks/main.yml

@@ -1,4 +1,4 @@
-#  Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+#  Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -13,235 +13,11 @@
 #  limitations under the License.
 ---
 
-- name: Include common variables
-  include_vars: ../../slurm_exporter/vars/main.yml
+- name: Deploy K8s services
+  include_tasks: deploy_k8s_services.yml
+  when: "'manager' in group_names"
+  tags: install
 
-- name: Wait for CoreDNS to restart
-  command: kubectl rollout status deployment/coredns -n kube-system  --timeout=5m
-  changed_when: false
-  failed_when: false
-  tags: init
-
-- name: Get K8s pods
-  command: kubectl get pods --all-namespaces
-  changed_when: false
-  register: k8s_pods
-  tags: init
-
-- name: Deploy MetalLB
-  command: "kubectl apply -f '{{ metallb_yaml_url }}'"
-  changed_when: true
-  when: "'metallb' not in k8s_pods.stdout"
-  tags: init
-
-- name: Create MetalLB Setup Config Files
-  copy:
-    src: metal-config.yaml
-    dest: "{{ metallb_config_file_dest }}"
-    owner: root
-    group: root
-    mode: "{{ metallb_config_file_mode }}"
-  tags: init
-
-- name: Create MetalLB Setup Deployment Files
-  copy:
-    src: metallb.yaml
-    dest: "{{ metallb_deployment_file_dest }}"
-    owner: root
-    group: root
-    mode: "{{ metallb_deployment_file_mode }}"
-  tags: init
-
-- name: Deploy MetalLB
-  command: "kubectl apply -f '{{ metallb_deployment_file_dest }}'"
-  changed_when: true
-  when: "'metallb' not in k8s_pods.stdout"
-  tags: init
-
-- name: Create default setup for MetalLB
-  command: "kubectl apply -f '{{ metallb_config_file_dest }}'"
-  changed_when: true
-  when: "'metallb' not in k8s_pods.stdout"
-  tags: init
-
-- name: Start k8s dashboard
-  command: "kubectl create -f '{{ k8s_dashboard_yaml_url }}'"
-  changed_when: true
-  when: "'kubernetes-dashboard' not in k8s_pods.stdout"
-  tags: init
-
-- name: Copy k8s_dashboard_admin.yml file
-  copy:
-    src: k8s_dashboard_admin.yaml
-    dest: "{{ k8s_dashboard_admin_file_dest }}"
-    owner: root
-    group: root
-    mode: "{{ k8s_dashboard_admin_file_mode }}"
-
-- name: Create admin user for K8s dashboard
-  command: "kubectl apply -f {{ k8s_dashboard_admin_file_dest }}"
-  changed_when: true
-
-- name: Helm - add stable repo
-  command: "helm repo add stable '{{ helm_stable_repo_url }}'"
-  changed_when: true
-  tags: init
-
-- name: Helm - add Nvidia k8s-device-plugin (nvdp) repo
-  command: "helm repo add nvdp '{{ nvidia_k8s_device_plugin_repo_url }}'"
-  changed_when: true
-  tags: init
-
-- name: Helm - add Nvidia GPU discovery (nvgfd) repo
-  command: "helm repo add nvgfd '{{ nvidia_gpu_discovery_repo_url }}'"
-  changed_when: true
-  tags: init
-
-- name: Helm - update repo
-  command: helm repo update
-  changed_when: true
-  tags: init
-
-- name: Start NFS Client Provisioner
-  command: "helm install stable/nfs-client-provisioner --set nfs.server='{{ nfs_server }}' --set nfs.path='{{ nfs_path }}' --generate-name"
-  changed_when: true
-  when: "'nfs-client-provisioner' not in k8s_pods.stdout"
-  tags: init
-
-- name: Set NFS-Client Provisioner as DEFAULT StorageClass
-  shell: >
-    kubectl patch storageclasses.storage.k8s.io nfs-client \
-    -p '{ "metadata": { "annotations":{ "storageclass.kubernetes.io/is-default-class":"true" }}}'
-  changed_when: true
-  tags: init
-
-- name: Check if prometheus is installed on the host
-  stat:
-    path: "{{ prometheus_path_on_host }}"
-  register: prometheus_status
-  changed_when: False
-  ignore_errors: yes
-  tags: init
-
-- name: Delete prometheus installed on host if it exists
-  file:
-    path: "{{ prometheus_path_on_host }}"
-    state: absent
-  when: prometheus_status.stat.exists
-  tags: init
-
-- name: Copy the slurm exporter config file
-  copy:
-    src: "{{ slurm_exporter_config_file }}"
-    dest: "{{ slurm_exporter_config_file_path }}"
-    owner: root
-    group: root
-    mode: "{{ slurm_exporter_file_mode }}"
-  tags: init
-
-- name: Fetch the public IP of the host
-  shell: >
-    set -o pipefail && \
-      ip route get 8.8.8.8 | awk '{print $7}'
-  register: public_ip
-  changed_when: False
-  tags: init
-
-- name: Add the host IP to config file
-  replace:
-    path: "{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}"
-    regexp: "localhost:8080"
-    replace: "{{ public_ip.stdout }}:{{ slurm_exporter_port }}"
-  tags: init
-
-- name: Prometheus deployment
-  command: >
-    helm install stable/prometheus \
-    --set-file extraScrapeConfigs="{{ slurm_exporter_config_file_path }}{{ slurm_exporter_config_file }}" \
-    --set alertmanager.persistentVolume.storageClass=nfs-client,server.persistentVolume.storageClass=nfs-client,server.service.type=LoadBalancer \
-    --generate-name
-  changed_when: true
-  when: "'prometheus' not in k8s_pods.stdout"
-  tags: init
-
-- name: Install MPI Operator
-  command: "kubectl create -f '{{ mpi_operator_yaml_url }}'"
-  changed_when: true
-  when: "'mpi-operator' not in k8s_pods.stdout"
-  tags: init
-
-- name: Install nvidia-device-plugin
-  command: "helm install --version='{{ nvidia_device_plugin_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvdp/nvidia-device-plugin"
-  changed_when: true
-  when: "'nvidia-device-plugin' not in k8s_pods.stdout"
-  tags: init
-
-- name: Install GPU Feature Discovery
-  command: "helm install --version='{{ gpu_feature_discovery_version }}' --generate-name --set migStrategy='{{ mig_strategy }}' nvgfd/gpu-feature-discovery"
-  changed_when: true
-  when: "'node-feature-discovery' not in k8s_pods.stdout"
-  tags: init
-
-- name: Deploy Xilinx Device plugin
-  command: "kubectl create -f '{{ fpga_device_plugin_yaml_url }}'"
-  changed_when: true
-  register: fpga_enable
-  when: "'fpga-device-plugin' not in k8s_pods.stdout"
-  tags: init
-
-- name: Deploy ROCm Device plugin
-  command: "kubectl create -f '{{ rocm_device_plugin_yaml_url }}'"
-  changed_when: true
-  register: amd_gpu_enable
-  when: "'amdgpu-device-plugin' not in k8s_pods.stdout"
-  tags: init
-
-- name: Deploy Volcano Scheduling
-  command: "kubectl apply -f '{{ volcano_scheduling_yaml_url }}'"
-  changed_when: true
-  when: "'volcano-system' not in k8s_pods.stdout"
-  tags: init
-
-- name: Install Spark Operator
-  command: "helm repo add spark-operator '{{ spark_operator_repo }}'"
-  changed_when: true
-  tags: init
-
-- name: Install Spark Operator Namespace
-  command: helm install my-release spark-operator/spark-operator --set image.tag={{ operator_image_tag }} --namespace spark-operator --create-namespace
-  changed_when: true
-  when: "'spark-operator' not in k8s_pods.stdout"
-  tags: init
-
-- name: Wait for k8s pod to come to ready state
-  block:
-    - name: Wait for k8s pod to come to ready state
-      command: "kubectl wait --for=condition=ready -n {{ item.namespace }} pod -l app={{ item.app }} --timeout={{ item.timeout }}"
-      with_items:
-        - { namespace: "default", app: "nfs-client-provisioner", timeout: "10m" }
-        - { namespace: "volcano-system", app: "volcano-scheduler", timeout: "5m" }
-      changed_when: false
-      tags: install
-  rescue:
-    - name: Get K8s pods
-      command: kubectl get pods --all-namespaces
-      changed_when: false
-      register: k8s_pods
-      tags: init
-
-    - name: Fail message
-      fail:
-        msg: "{{ docker_pull_limit_msg }}"
-      when:
-        - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
-        - not hostvars['127.0.0.1']['docker_username'] and not hostvars['127.0.0.1']['docker_password']
-
-    - name: Pull K8s services docker images
-      command: docker pull {{ item }}
-      with_items: "{{ k8s_docker_images }}"
-      when:
-        - "'ImagePullBackOff' in k8s_pods.stdout or 'ErrImagePull' in k8s_pods.stdout"
-        - hostvars['127.0.0.1']['docker_username'] and hostvars['127.0.0.1']['docker_password']
-      register: docker_image_pull_result
-      until: docker_image_pull_result is not failed
-      retries: 5
+- name: Check K8s pods
+  include_tasks: check_k8s_pods.yml
+  tags: install

+ 5 - 0
roles/k8s_start_services/vars/main.yml

@@ -18,6 +18,7 @@ k8s_docker_images:
   - docker.io/calico/cni:v3.19.1
   - docker.io/calico/pod2daemon-flexvol:v3.19.1
   - docker.io/calico/node:v3.19.1
+  - quay.io/coreos/flannel:v0.14.0
   - xilinxatg/xilinx_k8s_fpga_plugin:2020.11.24
   - nvidia/k8s-device-plugin:v0.7.0
   - quay.io/external_storage/nfs-client-provisioner:v3.1.0-k8s1.11
@@ -35,6 +36,10 @@ k8s_docker_images:
   - volcanosh/vc-controller-manager:latest
   - volcanosh/vc-scheduler:latest
   - volcanosh/vc-webhook-manager:latest
+  - mpioperator/mpi-operator:latest
+  - rocm/k8s-device-plugin
+
+docker_login_fail_msg: "Docker login failed! Please check the credentials and re-execute playbook."
 
 docker_pull_limit_msg: "You have reached your docker pull rate limit. Please provide docker credentials in omnia_config.yml and try again"