deepops/playbooks/k8s-cluster.yml at 87cf0569c1cdf0bb545bafe1016517d29aa3c293 · dholt/deepops · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
---
# Kubernetes Cluster Playbook

# Derive runtime-dependent facts from `container_manager`:
#   - any runtime other than docker: enable the GPU operator and deploy etcd
#     on the host (etcd_deployment_type must be `host` in that case)
#   - docker: deploy etcd in docker and make docker the GPU operator's
#     default runtime
# Neither task runs when `container_manager` is undefined.
- hosts: all
  tasks:
    - name: Set facts when not using docker container runtime (default)
      when: container_manager is defined and container_manager != "docker"
      set_fact:
        etcd_deployment_type: host
        deepops_gpu_operator_enabled: true
    - name: Set facts when using Docker container runtime
      when: container_manager is defined and container_manager == "docker"
      set_fact:
        gpu_operator_default_runtime: "docker"
        etcd_deployment_type: docker

# Bootstrap step: pull in the playbook that installs Python for Ansible.
- import_playbook: bootstrap/bootstrap-python.yml
  name: Install python required for Ansible
  tags:
    - bootstrap

# Bring the git submodules (Kubespray lives under ../submodules) to the
# revisions recorded by this repository. The command is executed a single
# time on localhost, without privilege escalation, from the parent directory
# of the playbook directory.
- hosts: all
  gather_facts: false
  tags:
    - local
  vars:
    ansible_connection: local
    ansible_become: false
  tasks:
    - name: make sure kubespray is at the correct version
      run_once: true
      delegate_to: localhost
      args:
        chdir: "{{ playbook_dir | dirname }}"
      command: git submodule update --init

# Bootstrap imports: SSH keys first, then passwordless sudo.
- import_playbook: bootstrap/bootstrap-ssh.yml
  name: Set up SSH keys if needed
  tags:
    - bootstrap

- import_playbook: bootstrap/bootstrap-sudo.yml
  name: Set up passwordless sudo if needed
  tags:
    - bootstrap

# Applied unless `deepops_disable_cloud_init` is set to a falsy value.
- import_playbook: generic/disable-cloud-init.yml
  name: Disable cloud-init
  when: deepops_disable_cloud_init|default(true)

# Applied unless `chrony_install` is set to a falsy value.
- import_playbook: generic/chrony-client.yml
  name: Configure Chrony (NTP) sync
  when: chrony_install|default(true)

# Bootstrap step: OpenShift API libraries, a prerequisite of the GPU plugin.
- import_playbook: bootstrap/bootstrap-openshift.yml
  name: Install the OpenShift API libraries required by the GPU plugin
  tags:
    - bootstrap

# Always imported: no tags and no condition are attached to this step.
- import_playbook: generic/hosts.yml
  name: Configure hostnames, /etc/hosts

# Optionally deploy a standalone container registry, targeting the
# kube_control_plane group. Only runs when `kube_enable_container_registry`
# is set truthy (it defaults to false).
# The target group is passed through `vars:` instead of an inline
# `hostlist=...` suffix on the import_playbook line: the inline form is
# deprecated by Ansible, and `vars:` matches how the other imports in this
# playbook pass `hostlist`.
- name: Set up a local cluster container registry
  import_playbook: container/standalone-container-registry.yml
  vars:
    hostlist: "kube_control_plane"
  when: kube_enable_container_registry|default(false)

# Install the 'sshpass' program on every host; background in
# https://github.com/ansible/ansible/issues/56629
# RedHat-family hosts get the epel-release package first.
# Tasks run with `proxy_env` as their environment when it is defined.
- hosts: all
  gather_facts: true
  tags:
    - bootstrap
  environment: "{{proxy_env if proxy_env is defined else {}}}"
  tasks:
    - name: install epel
      when: ansible_os_family == "RedHat"
      package:
        state: present
        name: epel-release
    - name: install sshpass
      package:
        state: present
        name: sshpass

# Release Docker-related packages from the dpkg 'held' state on Ubuntu.
# A held package cannot be removed, which can make the install of the
# containerd runtime fail. The hold is released by switching each package's
# dpkg selection to `purge`. Only applies when a non-docker container manager
# is configured; failures are tolerated.
- hosts: all
  gather_facts: true
  environment: "{{proxy_env if proxy_env is defined else {}}}"
  tasks:
    - name: un-hold container runtime packages on Ubuntu
      when:
        - container_manager is defined and container_manager != "docker"
        - ansible_distribution == "Ubuntu"
      ignore_errors: true
      dpkg_selections:
        selection: purge
        name: "{{ item }}"
      loop:
        - docker-ce
        - docker-ce-cli
        - docker-ce-rootless-extras
        - containerd.io

# Hand over to Kubespray's cluster playbook to install Kubernetes.
# Configuration lives in config/group_vars/k8s_cluster.yml
- import_playbook: ../submodules/kubespray/cluster.yml
  name: Install Kubernetes
  tags:
    - k8s

# Disable swap (required for k8s), kubespray method doesn't quite cut it.
# Two steps, both with privilege escalation:
#   1. delete every line of /etc/fstab that matches the regexp 'swap'
#   2. turn off all currently active swap with `swapoff -a`
# NOTE(review): the regexp is unanchored, so any fstab line that merely
# contains the text "swap" (including comments) is removed — confirm that
# is acceptable for the target hosts.
- hosts: all
  become: true
  tasks:
    - name: remove swap from fstab
      # Native YAML module arguments instead of a key=value string.
      lineinfile:
        path: /etc/fstab
        regexp: 'swap'
        state: absent
    - name: disable swap
      command: swapoff -a
  tags:
    - swap

# Manage Kubernetes cluster access config file.
# Every task is delegated to localhost and runs once, without privilege
# escalation: ensure the artifacts directory and the user's ~/.kube/ exist,
# then, if admin.conf is present in the artifacts directory, copy it to
# ~/.kube/config (a backup of any replaced file is kept).
- hosts: k8s_cluster
  gather_facts: false
  tags:
    - local
  vars:
    ansible_become: false
  tasks:
    - name: create artifacts directory
      delegate_to: localhost
      run_once: true
      file:
        state: directory
        path: "{{ artifacts_dir }}"
    - name: create kube config directory for current user
      delegate_to: localhost
      run_once: true
      file:
        state: directory
        path: "{{ lookup('env','HOME') + '/.kube/' }}"
    - name: check for kube config file
      delegate_to: localhost
      run_once: true
      stat:
        path: "{{ artifacts_dir }}/admin.conf"
      register: kubeconf
    - name: copy kube config file for current user
      delegate_to: localhost
      run_once: true
      when: kubeconf.stat.exists
      copy:
        backup: true
        dest: "{{ lookup('env','HOME') + '/.kube/config' }}"
        src: "{{ artifacts_dir }}/admin.conf"

# Driver install on the k8s_cluster group. Runs when the GPU operator is
# disabled, or when `gpu_operator_preinstalled_nvidia_software` is truthy
# (its default).
- import_playbook: nvidia-software/nvidia-driver.yml
  name: Install NVIDIA driver on GPU servers
  tags:
    - nvidia
  vars:
    hostlist: "k8s_cluster"
  when: >-
    deepops_gpu_operator_enabled|default(true) | bool == false or
    gpu_operator_preinstalled_nvidia_software|default(true)

# NVIDIA container runtime install on the k8s_cluster group. Both conditions
# must hold: (GPU operator disabled OR preinstalled NVIDIA software
# requested) AND the container manager is docker.
- import_playbook: container/nvidia-docker.yml
  name: Install NVIDIA container runtime on GPU servers
  tags:
    - nvidia
  vars:
    hostlist: "k8s_cluster"
  when:
    - >-
      deepops_gpu_operator_enabled|default(true) | bool == false or
      gpu_operator_preinstalled_nvidia_software|default(true)
    - container_manager is defined and container_manager == "docker"

# Manage kubectl binary, part 1: pull /usr/local/bin/kubectl from a
# control-plane host into the artifacts directory on the Ansible host.
# Runs once and without privilege escalation.
- hosts: kube_control_plane
  gather_facts: false
  tags:
    - local
  vars:
    ansible_become: false
  tasks:
    - name: copy kubectl binary to ansible host
      run_once: true
      synchronize:
        mode: pull
        dest: "{{ artifacts_dir }}/kubectl"
        src: "/usr/local/bin/kubectl"
# Install the kubectl binary found in the artifacts directory onto the
# Ansible host. Every task is delegated to localhost and runs once.
# The privileged copy and chown/chmod steps tolerate failure; if the copy
# failed, the final task prints the manual `sudo cp` command to run instead.
- hosts: k8s_cluster
  gather_facts: false
  tags:
    - local
  vars:
    config_dir: "../config"
  tasks:
    - name: check for kubectl
      delegate_to: localhost
      run_once: true
      become: false
      stat:
        path: "{{ artifacts_dir }}/kubectl"
      register: kubectl_local
    - name: modify kubectl permissions
      delegate_to: localhost
      run_once: true
      become: false
      when: kubectl_local.stat.exists
      file:
        mode: '0755'
        path: "{{ artifacts_dir }}/kubectl"
    - name: copy kubectl to /usr/local/bin
      delegate_to: localhost
      run_once: true
      become: true
      when: kubectl_local.stat.exists
      copy:
        dest: "/usr/local/bin/kubectl"
        src: "{{ artifacts_dir }}/kubectl"
      register: kubectl_copied
      ignore_errors: true
    - name: check for copied kubectl
      delegate_to: localhost
      run_once: true
      stat:
        path: "/usr/local/bin/kubectl"
      register: kubectl_system
    - name: modify kubectl permissions
      delegate_to: localhost
      run_once: true
      become: true
      when: kubectl_system.stat.exists
      file:
        path: "/usr/local/bin/kubectl"
        mode: '0755'
        group: root
        owner: root
      ignore_errors: true
    - name: manually move kubectl binary
      delegate_to: localhost
      run_once: true
      when: kubectl_copied is failed
      debug:
        msg: "Unable to move kubectl, run: sudo cp {{ artifacts_dir | realpath }}/kubectl /usr/local/bin"

# Install Helm on the admin node, then remove the control-plane NoSchedule
# taint from kube_control_plane nodes.
# Removing the taint keeps backwards compatibility and allows a few services
# (monitoring/etc.) to run properly.
# All tasks are delegated to localhost and run without privilege escalation.
- hosts: kube_control_plane
  gather_facts: false
  vars:
    ansible_become: no
  tasks:
    # The two Helm tasks act on localhost only and do not reference the
    # inventory host, so they run once rather than once per control-plane
    # node (which would start the same install several times in parallel on
    # the same machine).
    - name: Install Helm on admin node
      command: "sh {{ playbook_dir }}/../scripts/k8s/install_helm.sh"
      delegate_to: localhost
      run_once: true
    - name: Globally update the deprecated "stable" helm repo
      command: "/usr/local/bin/helm repo add 'stable' 'https://charts.helm.sh/stable' --force-update"
      delegate_to: localhost
      run_once: true
    # Runs for every control-plane host: the command names the inventory host
    # whose taint is removed, using the kubectl binary and admin.conf from the
    # artifacts directory.
    - name: kubeadm | Remove taint for master with node role
      command: "{{ artifacts_dir }}/kubectl --kubeconfig {{ artifacts_dir }}/admin.conf taint node {{ inventory_hostname }} node-role.kubernetes.io/control-plane:NoSchedule-"
      delegate_to: localhost
      failed_when: false # Taint will not be present if kube_control_plane also under kube_node

# Standalone GPU feature discovery; only used when the GPU operator is disabled.
- import_playbook: k8s-cluster/nvidia-k8s-gpu-feature-discovery.yml
  name: Install k8s GPU feature discovery
  when: deepops_gpu_operator_enabled|default(true) | bool == false
  tags:
    - nvidia

# Standalone GPU device plugin; only used when the GPU operator is disabled.
- import_playbook: k8s-cluster/nvidia-k8s-gpu-device-plugin.yml
  name: Install k8s GPU device plugin
  when: deepops_gpu_operator_enabled|default(true) | bool == false
  tags:
    - nvidia

# GPU operator install; used when `deepops_gpu_operator_enabled` is true,
# which is also its default when unset.
- import_playbook: k8s-cluster/nvidia-gpu-operator.yml
  name: Install NVIDIA GPU Operator
  when: deepops_gpu_operator_enabled|default(true) | bool == true
  tags:
    - nvidia

# Optional NFS client provisioner; runs only when
# `k8s_nfs_client_provisioner` is truthy.
# The fallback is the boolean `false`, not the string 'false': a non-empty
# string is truthy in a `when` expression, so a string default would enable
# this play whenever the variable is undefined. `| bool` additionally
# normalises string values such as "false" or "no".
- name: Setup a volume for NFS, install nfs_utils, and install NFS Helm chart
  import_playbook: k8s-cluster/nfs-client-provisioner.yml
  when: k8s_nfs_client_provisioner | default(false) | bool

# rsyslog server: targets `rsyslog_server_hostname` when set, otherwise the
# first kube_control_plane host. Skipped only if `kube_enable_rsyslog_server`
# is set falsy.
- import_playbook: generic/rsyslog-server.yml
  name: Setup rsyslog server
  when: kube_enable_rsyslog_server|default(true)
  vars:
    hostlist: "{{ rsyslog_server_hostname | default('kube_control_plane[0]') }}"

# rsyslog clients: targets `rsyslog_client_group` when set, otherwise the
# k8s_cluster group. Skipped only if `kube_enable_rsyslog_client` is set
# falsy.
- import_playbook: generic/rsyslog-client.yml
  name: Setup rsyslog client
  when: kube_enable_rsyslog_client|default(true)
  vars:
    hostlist: "{{ rsyslog_client_group | default('k8s_cluster') }}"