"""
Kubernetes cluster task, deployed via kubeadm
"""
import argparse
import contextlib
import ipaddress
import logging
import random
import yaml
from io import BytesIO

from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology.config import config as teuth_config
from teuthology.orchestra import run

log = logging.getLogger(__name__)


def _kubectl(ctx, config, args, **kwargs):
    # config may be the task's dict or, via the kubectl() wrapper below, a
    # plain list of commands; fall back to the default cluster name then
    if isinstance(config, dict):
        cluster_name = config.get('cluster', 'kubeadm')
    else:
        cluster_name = 'kubeadm'
    ctx.kubeadm[cluster_name].bootstrap_remote.run(
        args=['kubectl'] + args,
        **kwargs,
    )


def kubectl(ctx, config):
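    """
    Run one or more kubectl commands on the bootstrap node.

    ``config`` may be a single command string, a list of command strings, or
    a list of pre-split argument lists; e.g. (illustrative only, assuming the
    kubeadm task has already run and populated ctx.kubeadm)::

        kubectl(ctx, ['get nodes', 'get pods -A'])
    """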
    if isinstance(config, str):
        config = [config]
    assert isinstance(config, list)
    for c in config:
        if isinstance(c, str):
            _kubectl(ctx, config, c.split(' '))
        else:
            _kubectl(ctx, config, c)


@contextlib.contextmanager
def preflight(ctx, config):
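    """
    Prepare every node for kubeadm: load br_netfilter, enable the bridge
    netfilter and IP forwarding sysctls, disable swap, and switch docker to
    the systemd cgroup driver.
    """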
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo', 'modprobe', 'br_netfilter',
                run.Raw('&&'),
                'sudo', 'sysctl', 'net.bridge.bridge-nf-call-ip6tables=1',
                run.Raw('&&'),
                'sudo', 'sysctl', 'net.bridge.bridge-nf-call-iptables=1',
                run.Raw('&&'),
                'sudo', 'sysctl', 'net.ipv4.ip_forward=1',
                run.Raw('&&'),
                'sudo', 'swapoff', '-a',
            ],
            wait=False,
        )
    )

    # set docker cgroup driver = systemd
    # see https://kubernetes.io/docs/setup/production-environment/container-runtimes/#docker
    # see https://github.com/kubernetes/kubeadm/issues/2066
    daemon_json = """
{
  "exec-opts": ["native.cgroupdriver=systemd"],
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "100m"
  },
  "storage-driver": "overlay2"
}
"""
    for remote in ctx.cluster.remotes.keys():
        remote.write_file('/etc/docker/daemon.json', daemon_json, sudo=True)
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo', 'systemctl', 'restart', 'docker',
                run.Raw('||'),
                'true',
            ],
            wait=False,
        )
    )
    yield


@contextlib.contextmanager
def kubeadm_install(ctx, config):
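    """
    Install cri-o (centos/rhel only) and the kubeadm/kubelet/kubectl packages
    from their upstream repos, pre-pull the control plane images, and remove
    the packages again on teardown unless ``uninstall: false`` is configured.
    """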
    version = config.get('version', '1.21')

    os_type = teuthology.get_distro(ctx)
    os_version = teuthology.get_distro_version(ctx)

    try:
        if os_type in ['centos', 'rhel']:
            os = f"CentOS_{os_version.split('.')[0]}"
            log.info('Installing cri-o')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'curl', '-L', '-o',
                        '/etc/yum.repos.d/devel:kubic:libcontainers:stable.repo',
                        f'https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/{os}/devel:kubic:libcontainers:stable.repo',
                        run.Raw('&&'),
                        'sudo',
                        'curl', '-L', '-o',
                        f'/etc/yum.repos.d/devel:kubic:libcontainers:stable:cri-o:{version}.repo',
                        f'https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable:/cri-o:/{version}/{os}/devel:kubic:libcontainers:stable:cri-o:{version}.repo',
                        run.Raw('&&'),
                        'sudo', 'dnf', 'install', '-y', 'cri-o',
                    ],
                    wait=False,
                )
            )

            log.info('Installing kube{adm,ctl,let}')
            repo = """[kubernetes]
name=Kubernetes
baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-$basearch
enabled=1
gpgcheck=1
repo_gpgcheck=1
gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
"""
            for remote in ctx.cluster.remotes.keys():
                remote.write_file(
                    '/etc/yum.repos.d/kubernetes.repo',
                    repo,
                    sudo=True,
                )
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'dnf', 'install', '-y',
                        'kubelet', 'kubeadm', 'kubectl',
                        'iproute-tc', 'bridge-utils',
                    ],
                    wait=False,
                )
            )

            # fix cni config
            for remote in ctx.cluster.remotes.keys():
                conf = """# from https://github.com/cri-o/cri-o/blob/master/tutorials/kubernetes.md#flannel-network
{
    "name": "crio",
    "type": "flannel"
}
"""
                remote.write_file('/etc/cni/net.d/10-crio-flannel.conf', conf, sudo=True)
                remote.run(args=[
                    'sudo', 'rm', '-f',
                    '/etc/cni/net.d/87-podman-bridge.conflist',
                    '/etc/cni/net.d/100-crio-bridge.conf',
                ])

            # start crio
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'systemctl', 'daemon-reload',
                        run.Raw('&&'),
                        'sudo', 'systemctl', 'enable', 'crio', '--now',
                    ],
                    wait=False,
                )
            )

        elif os_type == 'ubuntu':
            os = f"xUbuntu_{os_version}"
            log.info('Installing kube{adm,ctl,let}')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'apt', 'update',
                        run.Raw('&&'),
                        'sudo', 'apt', 'install', '-y',
                        'apt-transport-https', 'ca-certificates', 'curl',
                        run.Raw('&&'),
                        'sudo', 'curl', '-fsSLo',
                        '/usr/share/keyrings/kubernetes-archive-keyring.gpg',
                        'https://packages.cloud.google.com/apt/doc/apt-key.gpg',
                        run.Raw('&&'),
                        'echo', 'deb [signed-by=/usr/share/keyrings/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main',
                        run.Raw('|'),
                        'sudo', 'tee', '/etc/apt/sources.list.d/kubernetes.list',
                        run.Raw('&&'),
                        'sudo', 'apt', 'update',
                        run.Raw('&&'),
                        'sudo', 'apt', 'install', '-y',
                        'kubelet', 'kubeadm', 'kubectl',
                        'bridge-utils',
                    ],
                    wait=False,
                )
            )

        else:
            raise RuntimeError(f'unsupported distro {os_type} for cri-o')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'systemctl', 'enable', '--now', 'kubelet',
                    run.Raw('&&'),
                    'sudo', 'kubeadm', 'config', 'images', 'pull',
                ],
                wait=False,
            )
        )

        yield

    finally:
        if config.get('uninstall', True):
            log.info('Uninstalling kube{adm,let,ctl}')
            if os_type in ['centos', 'rhel']:
                run.wait(
                    ctx.cluster.run(
                        args=[
                            'sudo', 'rm', '-f',
                            '/etc/yum.repos.d/kubernetes.repo',
                            run.Raw('&&'),
                            'sudo', 'dnf', 'remove', '-y',
                            'kubeadm', 'kubelet', 'kubectl', 'cri-o',
                        ],
                        wait=False
                    )
                )
            elif os_type == 'ubuntu' and False:
                # intentionally disabled ('and False'): ubuntu packages are
                # currently left in place on teardown
                run.wait(
                    ctx.cluster.run(
                        args=[
                            'sudo', 'rm', '-f',
                            '/etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list',
                            f'/etc/apt/sources.list.d/devel:kubic:libcontainers:stable:cri-o:{version}.list',
                            '/etc/apt/trusted.gpg.d/libcontainers-cri-o.gpg',
                            run.Raw('&&'),
                            'sudo', 'apt', 'remove', '-y',
                            'kubeadm', 'kubelet', 'kubectl', 'cri-o', 'cri-o-runc',
                        ],
                        wait=False,
                    )
                )


@contextlib.contextmanager
def kubeadm_init_join(ctx, config):
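    """
    Use the first host.* role as the bootstrap node, run ``kubeadm init``
    there, and ``kubeadm join`` every other host.* node to the new control
    plane; ``kubeadm reset`` cleans the nodes up on teardown.
    """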
    cluster_name = config['cluster']

    bootstrap_remote = None
    remotes = {}  # remote -> ip
    for remote, roles in ctx.cluster.remotes.items():
        for role in roles:
            if role.startswith('host.'):
                if not bootstrap_remote:
                    bootstrap_remote = remote
                if remote not in remotes:
                    remotes[remote] = remote.ssh.get_transport().getpeername()[0]
    if not bootstrap_remote:
        raise RuntimeError('must define at least one host.something role')
    ctx.kubeadm[cluster_name].bootstrap_remote = bootstrap_remote
    ctx.kubeadm[cluster_name].remotes = remotes
    ctx.kubeadm[cluster_name].token = 'abcdef.' + ''.join([
        random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for _ in range(16)
    ])
    log.info(f'Token: {ctx.kubeadm[cluster_name].token}')
    log.info(f'Remotes: {ctx.kubeadm[cluster_name].remotes}')

    try:
        # init
        cmd = [
            'sudo', 'kubeadm', 'init',
            '--node-name', ctx.kubeadm[cluster_name].bootstrap_remote.shortname,
            '--token', ctx.kubeadm[cluster_name].token,
            '--pod-network-cidr', str(ctx.kubeadm[cluster_name].pod_subnet),
        ]
        bootstrap_remote.run(args=cmd)

        # join additional nodes
        joins = []
        for remote, ip in ctx.kubeadm[cluster_name].remotes.items():
            if remote == bootstrap_remote:
                continue
            cmd = [
                'sudo', 'kubeadm', 'join',
                ctx.kubeadm[cluster_name].remotes[ctx.kubeadm[cluster_name].bootstrap_remote] + ':6443',
                '--node-name', remote.shortname,
                '--token', ctx.kubeadm[cluster_name].token,
                '--discovery-token-unsafe-skip-ca-verification',
            ]
            joins.append(remote.run(args=cmd, wait=False))
        run.wait(joins)
        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        log.info('Cleaning up node')
        run.wait(
            ctx.cluster.run(
                args=['sudo', 'kubeadm', 'reset', 'cleanup-node', '-f'],
                wait=False,
            )
        )


@contextlib.contextmanager
def kubectl_config(ctx, config):
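    """
    Copy the admin kubeconfig from the bootstrap node to ~/.kube/config and
    /root/.kube/config on every node so kubectl works everywhere.
    """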
    cluster_name = config['cluster']
    bootstrap_remote = ctx.kubeadm[cluster_name].bootstrap_remote

    ctx.kubeadm[cluster_name].admin_conf = \
        bootstrap_remote.read_file('/etc/kubernetes/admin.conf', sudo=True)

    log.info('Setting up kubectl')
    try:
        ctx.cluster.run(args=[
            'mkdir', '-p', '.kube',
            run.Raw('&&'),
            'sudo', 'mkdir', '-p', '/root/.kube',
        ])
        for remote in ctx.kubeadm[cluster_name].remotes.keys():
            remote.write_file('.kube/config', ctx.kubeadm[cluster_name].admin_conf)
            remote.sudo_write_file('/root/.kube/config',
                                   ctx.kubeadm[cluster_name].admin_conf)
        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        log.info('Deconfiguring kubectl')
        ctx.cluster.run(args=[
            'rm', '-rf', '.kube',
            run.Raw('&&'),
            'sudo', 'rm', '-rf', '/root/.kube',
        ])


def map_vnet(mip):
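    """
    Map a machine IP to a dedicated virtual (pod) subnet using the ``vnet``
    mappings in the teuthology config.

    Worked example with made-up values: with ``machine_subnet:
    192.168.0.0/24`` and ``virtual_subnet: 10.0.0.0/16``, a machine at
    192.168.0.5 sits at index 4 of its subnet's host list and therefore maps
    to the /24 at index 4 of the virtual range, i.e. 10.0.4.0/24.
    """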
    for mapping in teuth_config.get('vnet', []):
        mnet = ipaddress.ip_network(mapping['machine_subnet'])
        vnet = ipaddress.ip_network(mapping['virtual_subnet'])
        if vnet.prefixlen >= mnet.prefixlen:
            log.error(f"virtual_subnet {vnet} prefix >= machine_subnet {mnet} prefix")
            return None
        if mip in mnet:
            pos = list(mnet.hosts()).index(mip)
            log.info(f"{mip} is in {mnet} at pos {pos}")
            sub = list(vnet.subnets(32 - mnet.prefixlen))[pos]
            return sub
    return None


@contextlib.contextmanager
def allocate_pod_subnet(ctx, config):
    """
    Allocate a private subnet that will not collide with other test machines/clusters
    """
    cluster_name = config['cluster']
    assert cluster_name == 'kubeadm', 'multiple subnets not yet implemented'

    log.info('Identifying pod subnet')
    remote = list(ctx.cluster.remotes.keys())[0]
    ip = remote.ssh.get_transport().getpeername()[0]
    mip = ipaddress.ip_address(ip)
    vnet = map_vnet(mip)
    assert vnet
    log.info(f'Pod subnet: {vnet}')
    ctx.kubeadm[cluster_name].pod_subnet = vnet
    yield


@contextlib.contextmanager
def pod_network(ctx, config):
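    """
    Deploy a pod network add-on: calico by default, or flannel when
    ``pod_network: flannel`` is set, in either case using the pod CIDR chosen
    by allocate_pod_subnet.
    """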
    cluster_name = config['cluster']
    pnet = config.get('pod_network', 'calico')
    if pnet == 'flannel':
        r = ctx.kubeadm[cluster_name].bootstrap_remote.run(
            args=[
                'curl',
                'https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml',
            ],
            stdout=BytesIO(),
        )
        assert r.exitstatus == 0
        flannel = list(yaml.load_all(r.stdout.getvalue(), Loader=yaml.FullLoader))
        for o in flannel:
            if o.get('data', {}).get('net-conf.json'):
                log.info(f'Updating {o}')
                o['data']['net-conf.json'] = o['data']['net-conf.json'].replace(
                    '10.244.0.0/16',
                    str(ctx.kubeadm[cluster_name].pod_subnet)
                )
                log.info(f'Now {o}')
        flannel_yaml = yaml.dump_all(flannel)
        log.debug(f'Flannel:\n{flannel_yaml}')
        _kubectl(ctx, config, ['apply', '-f', '-'], stdin=flannel_yaml)

    elif pnet == 'calico':
        _kubectl(ctx, config, [
            'apply', '-f',
            'https://docs.projectcalico.org/manifests/tigera-operator.yaml'
        ])
        cr = {
            'apiVersion': 'operator.tigera.io/v1',
            'kind': 'Installation',
            'metadata': {'name': 'default'},
            'spec': {
                'calicoNetwork': {
                    'ipPools': [
                        {
                            'blockSize': 26,
                            'cidr': str(ctx.kubeadm[cluster_name].pod_subnet),
                            'encapsulation': 'VXLANCrossSubnet',
                            'natOutgoing': 'Enabled',
                            'nodeSelector': 'all()',
                        }
                    ]
                }
            }
        }
        _kubectl(ctx, config, ['create', '-f', '-'], stdin=yaml.dump(cr))

    else:
        raise RuntimeError(f'unrecognized pod_network {pnet}')

    try:
        yield

    finally:
        if pnet == 'flannel':
            _kubectl(ctx, config, [
                'delete', '-f',
                'https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml',
            ])

        elif pnet == 'calico':
            _kubectl(ctx, config, ['delete', 'installation', 'default'])
            _kubectl(ctx, config, [
                'delete', '-f',
                'https://docs.projectcalico.org/manifests/tigera-operator.yaml'
            ])


@contextlib.contextmanager
def setup_pvs(ctx, config):
    """
    Create PVs for all scratch LVs and set up a trivial provisioner
    """
    log.info('Scanning for scratch devices')
    crs = []
    for remote in ctx.cluster.remotes.keys():
        ls = remote.read_file('/scratch_devs').decode('utf-8').strip().splitlines()
        log.info(f'Scratch devices on {remote.shortname}: {ls}')
        for dev in ls:
            devname = dev.split('/')[-1].replace("_", "-")
            crs.append({
                'apiVersion': 'v1',
                'kind': 'PersistentVolume',
                'metadata': {'name': f'{remote.shortname}-{devname}'},
                'spec': {
                    'volumeMode': 'Block',
                    'accessModes': ['ReadWriteOnce'],
                    'capacity': {'storage': '100Gi'},  # doesn't matter?
                    'persistentVolumeReclaimPolicy': 'Recycle',
                    'storageClassName': 'scratch',
                    'local': {'path': dev},
                    'nodeAffinity': {
                        'required': {
                            'nodeSelectorTerms': [
                                {
                                    'matchExpressions': [
                                        {
                                            'key': 'kubernetes.io/hostname',
                                            'operator': 'In',
                                            'values': [remote.shortname]
                                        }
                                    ]
                                }
                            ]
                        }
                    }
                }
            })
            # overwriting first few MB is enough to make k8s happy
            remote.run(args=[
                'sudo', 'dd', 'if=/dev/zero', f'of={dev}', 'bs=1M', 'count=10'
            ])
    crs.append({
        'kind': 'StorageClass',
        'apiVersion': 'storage.k8s.io/v1',
        'metadata': {'name': 'scratch'},
        'provisioner': 'kubernetes.io/no-provisioner',
        'volumeBindingMode': 'WaitForFirstConsumer',
    })
    y = yaml.dump_all(crs)
    log.info('Creating PVs + StorageClass')
    log.debug(y)
    _kubectl(ctx, config, ['create', '-f', '-'], stdin=y)

    yield


@contextlib.contextmanager
def final(ctx, config):
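    """
    Remove the master taint from the bootstrap node so pods can be scheduled
    on it too.
    """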
    cluster_name = config['cluster']

    # remove master node taint
    _kubectl(ctx, config, [
        'taint', 'node',
        ctx.kubeadm[cluster_name].bootstrap_remote.shortname,
        'node-role.kubernetes.io/master-',
        run.Raw('||'),
        'true',
    ])

    yield


@contextlib.contextmanager
def task(ctx, config):
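    """
    Deploy a kubernetes cluster with kubeadm.  A minimal sketch of a job
    snippet using the options this task reads (the layout is illustrative,
    not copied from a real suite)::

        tasks:
        - kubeadm:
            pod_network: calico   # or flannel
            version: '1.21'       # cri-o repo version (centos/rhel only)
            uninstall: true       # remove packages on teardown
    """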
    if not config:
        config = {}
    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    log.info('Kubeadm start')

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('kubeadm', {}))
    log.info('Config: ' + str(config))

    # set up cluster context
    if not hasattr(ctx, 'kubeadm'):
        ctx.kubeadm = {}
    if 'cluster' not in config:
        config['cluster'] = 'kubeadm'
    cluster_name = config['cluster']
    if cluster_name not in ctx.kubeadm:
        ctx.kubeadm[cluster_name] = argparse.Namespace()

    with contextutil.nested(
        lambda: preflight(ctx, config),
        lambda: allocate_pod_subnet(ctx, config),
        lambda: kubeadm_install(ctx, config),
        lambda: kubeadm_init_join(ctx, config),
        lambda: kubectl_config(ctx, config),
        lambda: pod_network(ctx, config),
        lambda: setup_pvs(ctx, config),
        lambda: final(ctx, config),
    ):
        try:
            log.info('Kubeadm complete, yielding')
            yield

        finally:
            log.info('Tearing down kubeadm')