forked from NVIDIA/deepops
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathk8s-cluster.yml
More file actions
305 lines (281 loc) · 9.08 KB
/
k8s-cluster.yml
File metadata and controls
305 lines (281 loc) · 9.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
---
# Kubernetes Cluster Playbook

# Derive facts from the configured container runtime before anything else runs:
#   * any runtime other than docker: use the GPU operator and set
#     etcd_deployment_type to `host` (required when container_manager is not
#     `docker`)
#   * docker: set etcd_deployment_type to `docker` and make docker the GPU
#     operator default runtime
# No fact is set when container_manager is undefined.
- hosts: all
  tasks:
    - name: Set facts when not using docker container runtime (default)
      when: container_manager is defined and container_manager != "docker"
      set_fact:
        deepops_gpu_operator_enabled: true
        etcd_deployment_type: "host"

    - name: Set facts when using Docker container runtime
      when: container_manager is defined and container_manager == "docker"
      set_fact:
        etcd_deployment_type: "docker"
        gpu_operator_default_runtime: docker
# Bootstrap step: delegates to the bootstrap-python playbook.
- name: Install python required for Ansible
  tags: [bootstrap]
  import_playbook: bootstrap/bootstrap-python.yml
# Make sure the Kubespray submodule is at the correct version.
# `git submodule update --init` is run a single time, on the Ansible control
# host, from the parent directory of this playbook's directory, without become.
- hosts: all
  gather_facts: false
  vars:
    ansible_become: false
    ansible_connection: local
  tags: [local]
  tasks:
    - name: make sure kubespray is at the correct version
      command:
        cmd: git submodule update --init
        chdir: "{{ playbook_dir | dirname }}"
      run_once: true
      delegate_to: localhost
# Bootstrap access to the nodes: SSH keys first, then passwordless sudo.
- name: Set up SSH keys if needed
  tags: [bootstrap]
  import_playbook: bootstrap/bootstrap-ssh.yml

- name: Set up passwordless sudo if needed
  tags: [bootstrap]
  import_playbook: bootstrap/bootstrap-sudo.yml
# Disable cloud-init unless deepops_disable_cloud_init is set to a false value
# (enabled by default). The `bool` filter makes string values such as "false"
# (e.g. from `-e deepops_disable_cloud_init=false`) evaluate as false instead
# of as a non-empty, truthy string.
- name: Disable cloud-init
  import_playbook: generic/disable-cloud-init.yml
  when: deepops_disable_cloud_init | default(true) | bool
# Configure Chrony unless chrony_install is set to a false value (enabled by
# default). The `bool` filter makes string values such as "false" evaluate as
# false instead of as a non-empty, truthy string.
- name: Configure Chrony (NTP) sync
  import_playbook: generic/chrony-client.yml
  when: chrony_install | default(true) | bool
# Python OpenShift API libraries (bootstrap step), then hostname and
# /etc/hosts configuration.
- name: Install the OpenShift API libraries required by the GPU plugin
  tags: [bootstrap]
  import_playbook: bootstrap/bootstrap-openshift.yml

- import_playbook: generic/hosts.yml
  name: Configure hostnames, /etc/hosts
# Optional local container registry, targeted at the kube_control_plane group.
# Disabled unless kube_enable_container_registry is set to a true value.
# `hostlist` is passed through `vars:` because extra parameters appended to the
# import_playbook line are deprecated; `bool` makes string values such as
# "false" evaluate as false.
- name: Set up a local cluster container registry
  import_playbook: container/standalone-container-registry.yml
  vars:
    hostlist: "kube_control_plane"
  when: kube_enable_container_registry | default(false) | bool
# Install the 'sshpass' program, needed because of
# https://github.com/ansible/ansible/issues/56629
# On RedHat-family hosts the epel-release package is installed first.
# proxy_env, when defined, is exported as the environment for the whole play.
- hosts: all
  gather_facts: true
  environment: "{{ proxy_env if proxy_env is defined else {} }}"
  tags: [bootstrap]
  tasks:
    - name: install epel
      when: ansible_os_family == "RedHat"
      package:
        state: present
        name: epel-release

    - name: install sshpass
      package:
        state: present
        name: sshpass
# Un-hold container runtime packages on Ubuntu.
# Held packages cannot be removed, which makes the install of the containerd
# runtime fail. The dpkg selection of each listed package is set to `purge`,
# replacing any `hold` state. Only runs on Ubuntu when the configured runtime
# is not docker; errors are ignored (best effort).
- hosts: all
  gather_facts: true
  environment: "{{ proxy_env if proxy_env is defined else {} }}"
  tasks:
    - name: un-hold container runtime packages on Ubuntu
      dpkg_selections:
        selection: purge
        name: "{{ item }}"
      loop:
        - docker-ce
        - docker-ce-cli
        - docker-ce-rootless-extras
        - containerd.io
      when:
        - container_manager is defined
        - container_manager != "docker"
        - ansible_distribution == "Ubuntu"
      ignore_errors: true
# Install Kubernetes by importing the Kubespray cluster playbook.
# For configuration, see: config/group_vars/k8s_cluster.yml
- name: Install Kubernetes
  tags: [k8s]
  import_playbook: ../submodules/kubespray/cluster.yml
# Disable swap (required for k8s), kubespray method doesn't quite cut it.
# Every /etc/fstab line matching the regexp `swap` is removed, then swap is
# switched off on the running system with `swapoff -a`.
- hosts: all
  become: true
  tasks:
    # Native YAML module arguments instead of a key=value string.
    - name: remove swap from fstab
      lineinfile:
        path: /etc/fstab
        regexp: 'swap'
        state: absent

    - name: disable swap
      command: swapoff -a
  tags:
    - swap
# Manage the Kubernetes cluster access config file.
# Every task runs once on the Ansible control host, without become:
#   1. make sure the artifacts directory and $HOME/.kube exist
#   2. look for admin.conf in the artifacts directory
#   3. if it is there, copy it to $HOME/.kube/config, backing up any file that
#      is already in place
- hosts: k8s_cluster
  gather_facts: false
  vars:
    ansible_become: false
  tags: [local]
  tasks:
    - name: create artifacts directory
      delegate_to: localhost
      run_once: true
      file:
        state: directory
        path: "{{ artifacts_dir }}"

    - name: create kube config directory for current user
      delegate_to: localhost
      run_once: true
      file:
        state: directory
        path: "{{ lookup('env', 'HOME') }}/.kube/"

    - name: check for kube config file
      delegate_to: localhost
      run_once: true
      stat:
        path: "{{ artifacts_dir }}/admin.conf"
      register: kubeconf

    - name: copy kube config file for current user
      delegate_to: localhost
      run_once: true
      when: kubeconf.stat.exists
      copy:
        src: "{{ artifacts_dir }}/admin.conf"
        dest: "{{ lookup('env', 'HOME') }}/.kube/config"
        backup: true
# Install the NVIDIA driver on the k8s_cluster hosts when the GPU operator is
# disabled, or when gpu_operator_preinstalled_nvidia_software is true (both
# flags default to true). Both flags go through `bool` so that string values
# such as "false" evaluate as false rather than as truthy non-empty strings.
- name: Install NVIDIA driver on GPU servers
  import_playbook: nvidia-software/nvidia-driver.yml
  vars:
    hostlist: "k8s_cluster"
  tags:
    - nvidia
  when: >-
    not (deepops_gpu_operator_enabled | default(true) | bool)
    or (gpu_operator_preinstalled_nvidia_software | default(true) | bool)
# Install the NVIDIA container runtime on the k8s_cluster hosts. Requires the
# docker container runtime, and either a disabled GPU operator or
# gpu_operator_preinstalled_nvidia_software being true (both flags default to
# true). Both flags go through `bool` so that string values such as "false"
# evaluate as false rather than as truthy non-empty strings.
- name: Install NVIDIA container runtime on GPU servers
  import_playbook: container/nvidia-docker.yml
  vars:
    hostlist: "k8s_cluster"
  tags:
    - nvidia
  when:
    - >-
      not (deepops_gpu_operator_enabled | default(true) | bool)
      or (gpu_operator_preinstalled_nvidia_software | default(true) | bool)
    - container_manager is defined and container_manager == "docker"
# Manage kubectl binary: pull /usr/local/bin/kubectl from a control-plane node
# into the artifacts directory. Done once, without become.
- hosts: kube_control_plane
  gather_facts: false
  vars:
    ansible_become: false
  tags: [local]
  tasks:
    - name: copy kubectl binary to ansible host
      run_once: true
      synchronize:
        src: "/usr/local/bin/kubectl"
        dest: "{{ artifacts_dir }}/kubectl"
        mode: pull
# Install the kubectl binary from the artifacts directory into /usr/local/bin
# on the Ansible control host. Every task runs once and is delegated to
# localhost. Privileged steps ignore errors; when the copy fails, a message
# tells the user how to copy the binary manually.
- hosts: k8s_cluster
  gather_facts: false
  vars:
    config_dir: "../config"
  tags: [local]
  tasks:
    - name: check for kubectl
      delegate_to: localhost
      run_once: true
      become: false
      stat:
        path: "{{ artifacts_dir }}/kubectl"
      register: kubectl_local

    # Make the artifact itself executable (no privileges needed).
    - name: modify kubectl permissions
      delegate_to: localhost
      run_once: true
      become: false
      when: kubectl_local.stat.exists
      file:
        path: "{{ artifacts_dir }}/kubectl"
        mode: "0755"

    # Needs become; a failure is tolerated and reported by the last task.
    - name: copy kubectl to /usr/local/bin
      delegate_to: localhost
      run_once: true
      become: true
      when: kubectl_local.stat.exists
      copy:
        src: "{{ artifacts_dir }}/kubectl"
        dest: "/usr/local/bin/kubectl"
      register: kubectl_copied
      ignore_errors: true

    - name: check for copied kubectl
      delegate_to: localhost
      run_once: true
      stat:
        path: "/usr/local/bin/kubectl"
      register: kubectl_system

    # Fix ownership and mode of the system-wide copy, if there is one.
    - name: modify kubectl permissions
      delegate_to: localhost
      run_once: true
      become: true
      when: kubectl_system.stat.exists
      file:
        path: "/usr/local/bin/kubectl"
        owner: root
        group: root
        mode: "0755"
      ignore_errors: true

    - name: manually move kubectl binary
      delegate_to: localhost
      run_once: true
      when: kubectl_copied is failed
      debug:
        msg: "Unable to move kubectl, run: sudo cp {{ artifacts_dir | realpath }}/kubectl /usr/local/bin"
# Helm setup on the admin node and taint removal for kube_control_plane nodes.
# All tasks are delegated to localhost and run without become.
- hosts: kube_control_plane
  gather_facts: false
  vars:
    ansible_become: no
  tasks:
    # The two Helm tasks do not depend on the inventory host, so they run once
    # instead of once per control-plane node (which would launch concurrent
    # runs of the same command on localhost).
    - name: Install Helm on admin node
      command: "sh {{ playbook_dir }}/../scripts/k8s/install_helm.sh"
      delegate_to: localhost
      run_once: true

    - name: Globally update the deprecated "stable" helm repo
      command: "/usr/local/bin/helm repo add 'stable' 'https://charts.helm.sh/stable' --force-update"
      delegate_to: localhost
      run_once: true

    # Remove taint from kube_control_plane nodes.
    # This keeps backwards compatibility and allows a few services (monitoring/etc.) to run properly.
    # Runs for every control-plane node, using kubectl and admin.conf from the
    # artifacts directory.
    - name: kubeadm | Remove taint for master with node role
      command: "{{ artifacts_dir }}/kubectl --kubeconfig {{ artifacts_dir }}/admin.conf taint node {{ inventory_hostname }} node-role.kubernetes.io/control-plane:NoSchedule-"
      delegate_to: localhost
      failed_when: false  # Taint will not be present if kube_control_plane also under kube_node
# GPU support inside the cluster, selected by deepops_gpu_operator_enabled
# (treated as true when undefined):
#   * false: GPU feature discovery and GPU device plugin playbooks
#   * true:  NVIDIA GPU Operator playbook
- name: Install k8s GPU feature discovery
  tags: [nvidia]
  when: not (deepops_gpu_operator_enabled | default(true) | bool)
  import_playbook: k8s-cluster/nvidia-k8s-gpu-feature-discovery.yml

- name: Install k8s GPU device plugin
  tags: [nvidia]
  when: not (deepops_gpu_operator_enabled | default(true) | bool)
  import_playbook: k8s-cluster/nvidia-k8s-gpu-device-plugin.yml

- name: Install NVIDIA GPU Operator
  tags: [nvidia]
  when: deepops_gpu_operator_enabled | default(true) | bool
  import_playbook: k8s-cluster/nvidia-gpu-operator.yml
# Optional NFS client provisioner; skipped unless k8s_nfs_client_provisioner is
# set to a true value. The fallback must be the boolean false, not the string
# 'false': a non-empty string is truthy when evaluated by `when`. The `bool`
# filter likewise makes string values such as "false" evaluate as false.
- name: Setup a volume for NFS, install nfs_utils, and install NFS Helm chart
  import_playbook: k8s-cluster/nfs-client-provisioner.yml
  when: k8s_nfs_client_provisioner | default(false) | bool
# rsyslog server, on rsyslog_server_hostname or, when that is undefined, on
# the first kube_control_plane host. Enabled unless kube_enable_rsyslog_server
# is set to a false value; `bool` makes string values such as "false" evaluate
# as false instead of as a non-empty, truthy string.
- name: Setup rsyslog server
  import_playbook: generic/rsyslog-server.yml
  vars:
    hostlist: "{{ rsyslog_server_hostname | default('kube_control_plane[0]') }}"
  when: kube_enable_rsyslog_server | default(true) | bool
# rsyslog clients, on rsyslog_client_group or, when that is undefined, on the
# k8s_cluster group. Enabled unless kube_enable_rsyslog_client is set to a
# false value; `bool` makes string values such as "false" evaluate as false
# instead of as a non-empty, truthy string.
- name: Setup rsyslog client
  import_playbook: generic/rsyslog-client.yml
  vars:
    hostlist: "{{ rsyslog_client_group | default('k8s_cluster') }}"
  when: kube_enable_rsyslog_client | default(true) | bool