1"""
2Kubernetes cluster task, deployed via kubeadm
3"""
import argparse
import contextlib
import ipaddress
import logging
import random
import yaml
from io import BytesIO

from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology.config import config as teuth_config
from teuthology.orchestra import run

log = logging.getLogger(__name__)


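# Run a kubectl command (with the given args) on the cluster's bootstrap node.
# The kubectl() wrapper below accepts either a single command string, which is
# split on spaces, or a list of commands (each a string or a pre-split
# argument list).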
def _kubectl(ctx, config, args, **kwargs):
    cluster_name = config['cluster']
    ctx.kubeadm[cluster_name].bootstrap_remote.run(
        args=['kubectl'] + args,
        **kwargs,
    )


def kubectl(ctx, config):
    if isinstance(config, str):
        config = [config]
    assert isinstance(config, list)
    for c in config:
        if isinstance(c, str):
            _kubectl(ctx, config, c.split(' '))
        else:
            _kubectl(ctx, config, c)


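# Node preparation: load br_netfilter, enable the bridge-nf-call and
# ip_forward sysctls that kubeadm expects, disable swap, and switch docker to
# the systemd cgroup driver.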
@contextlib.contextmanager
def preflight(ctx, config):
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo', 'modprobe', 'br_netfilter',
                run.Raw('&&'),
                'sudo', 'sysctl', 'net.bridge.bridge-nf-call-ip6tables=1',
                run.Raw('&&'),
                'sudo', 'sysctl', 'net.bridge.bridge-nf-call-iptables=1',
                run.Raw('&&'),
                'sudo', 'sysctl', 'net.ipv4.ip_forward=1',
                run.Raw('&&'),
                'sudo', 'swapoff', '-a',
            ],
            wait=False,
        )
    )

    # set docker cgroup driver = systemd
    # see https://kubernetes.io/docs/setup/production-environment/container-runtimes/#docker
    # see https://github.com/kubernetes/kubeadm/issues/2066
    daemon_json = """
{
  "exec-opts": ["native.cgroupdriver=systemd"],
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "100m"
  },
  "storage-driver": "overlay2"
}
"""
    for remote in ctx.cluster.remotes.keys():
        remote.write_file('/etc/docker/daemon.json', daemon_json, sudo=True)
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo', 'systemctl', 'restart', 'docker',
                run.Raw('||'),
                'true',
            ],
            wait=False,
        )
    )
    yield


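# Install cri-o (on centos/rhel) and the kubeadm/kubelet/kubectl packages from
# distro-appropriate repos, enable kubelet, and pre-pull the control-plane
# images.  The packages are removed again on teardown unless the task config
# sets 'uninstall: false'.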
@contextlib.contextmanager
def kubeadm_install(ctx, config):
    version = config.get('version', '1.21')

    os_type = teuthology.get_distro(ctx)
    os_version = teuthology.get_distro_version(ctx)

    try:
        if os_type in ['centos', 'rhel']:
            os = f"CentOS_{os_version.split('.')[0]}"
            log.info('Installing cri-o')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'curl', '-L', '-o',
                        '/etc/yum.repos.d/devel:kubic:libcontainers:stable.repo',
                        f'https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/{os}/devel:kubic:libcontainers:stable.repo',
                        run.Raw('&&'),
                        'sudo',
                        'curl', '-L', '-o',
                        f'/etc/yum.repos.d/devel:kubic:libcontainers:stable:cri-o:{version}.repo',
                        f'https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable:/cri-o:/{version}/{os}/devel:kubic:libcontainers:stable:cri-o:{version}.repo',
                        run.Raw('&&'),
                        'sudo', 'dnf', 'install', '-y', 'cri-o',
                    ],
                    wait=False,
                )
            )

            log.info('Installing kube{adm,ctl,let}')
            repo = """[kubernetes]
name=Kubernetes
baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-$basearch
enabled=1
gpgcheck=1
repo_gpgcheck=1
gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
"""
            for remote in ctx.cluster.remotes.keys():
                remote.write_file(
                    '/etc/yum.repos.d/kubernetes.repo',
                    repo,
                    sudo=True,
                )
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'dnf', 'install', '-y',
                        'kubelet', 'kubeadm', 'kubectl',
                        'iproute-tc', 'bridge-utils',
                    ],
                    wait=False,
                )
            )

            # fix cni config
            for remote in ctx.cluster.remotes.keys():
                conf = """# from https://github.com/cri-o/cri-o/blob/master/tutorials/kubernetes.md#flannel-network
{
    "name": "crio",
    "type": "flannel"
}
"""
                remote.write_file('/etc/cni/net.d/10-crio-flannel.conf', conf, sudo=True)
                remote.run(args=[
                    'sudo', 'rm', '-f',
                    '/etc/cni/net.d/87-podman-bridge.conflist',
                    '/etc/cni/net.d/100-crio-bridge.conf',
                ])

            # start crio
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'systemctl', 'daemon-reload',
                        run.Raw('&&'),
                        'sudo', 'systemctl', 'enable', 'crio', '--now',
                    ],
                    wait=False,
                )
            )

        elif os_type == 'ubuntu':
            os = f"xUbuntu_{os_version}"
            log.info('Installing kube{adm,ctl,let}')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'apt', 'update',
                        run.Raw('&&'),
                        'sudo', 'apt', 'install', '-y',
                        'apt-transport-https', 'ca-certificates', 'curl',
                        run.Raw('&&'),
                        'sudo', 'curl', '-fsSLo',
                        '/usr/share/keyrings/kubernetes-archive-keyring.gpg',
                        'https://packages.cloud.google.com/apt/doc/apt-key.gpg',
                        run.Raw('&&'),
                        'echo', 'deb [signed-by=/usr/share/keyrings/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main',
                        run.Raw('|'),
                        'sudo', 'tee', '/etc/apt/sources.list.d/kubernetes.list',
                        run.Raw('&&'),
                        'sudo', 'apt', 'update',
                        run.Raw('&&'),
                        'sudo', 'apt', 'install', '-y',
                        'kubelet', 'kubeadm', 'kubectl',
                        'bridge-utils',
                    ],
                    wait=False,
                )
            )

        else:
            raise RuntimeError(f'unsupported distro {os_type} for cri-o')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'systemctl', 'enable', '--now', 'kubelet',
                    run.Raw('&&'),
                    'sudo', 'kubeadm', 'config', 'images', 'pull',
                ],
                wait=False,
            )
        )

        yield

    finally:
        if config.get('uninstall', True):
            log.info('Uninstalling kube{adm,let,ctl}')
            if os_type in ['centos', 'rhel']:
                run.wait(
                    ctx.cluster.run(
                        args=[
                            'sudo', 'rm', '-f',
                            '/etc/yum.repos.d/kubernetes.repo',
                            run.Raw('&&'),
                            'sudo', 'dnf', 'remove', '-y',
                            'kubeadm', 'kubelet', 'kubectl', 'cri-o',
                        ],
                        wait=False
                    )
                )
            elif os_type == 'ubuntu' and False:
                run.wait(
                    ctx.cluster.run(
                        args=[
                            'sudo', 'rm', '-f',
                            '/etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list',
                            f'/etc/apt/sources.list.d/devel:kubic:libcontainers:stable:cri-o:{version}.list',
                            '/etc/apt/trusted.gpg.d/libcontainers-cri-o.gpg',
                            run.Raw('&&'),
                            'sudo', 'apt', 'remove', '-y',
                            'kubeadm', 'kubelet', 'kubectl', 'cri-o', 'cri-o-runc',
                        ],
                        wait=False,
                    )
                )


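# The first remote with a host.* role becomes the bootstrap node and runs
# 'kubeadm init'; every other host.* remote joins it with a generated
# bootstrap token.  On exit, 'kubeadm reset' cleans each node up again.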
@contextlib.contextmanager
def kubeadm_init_join(ctx, config):
    cluster_name = config['cluster']

    bootstrap_remote = None
    remotes = {}  # remote -> ip
    for remote, roles in ctx.cluster.remotes.items():
        for role in roles:
            if role.startswith('host.'):
                if not bootstrap_remote:
                    bootstrap_remote = remote
                if remote not in remotes:
                    remotes[remote] = remote.ssh.get_transport().getpeername()[0]
    if not bootstrap_remote:
        raise RuntimeError('must define at least one host.something role')
    ctx.kubeadm[cluster_name].bootstrap_remote = bootstrap_remote
    ctx.kubeadm[cluster_name].remotes = remotes
    ctx.kubeadm[cluster_name].token = 'abcdef.' + ''.join([
        random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for _ in range(16)
    ])
    log.info(f'Token: {ctx.kubeadm[cluster_name].token}')
    log.info(f'Remotes: {ctx.kubeadm[cluster_name].remotes}')

    try:
        # init
        cmd = [
            'sudo', 'kubeadm', 'init',
            '--node-name', ctx.kubeadm[cluster_name].bootstrap_remote.shortname,
            '--token', ctx.kubeadm[cluster_name].token,
            '--pod-network-cidr', str(ctx.kubeadm[cluster_name].pod_subnet),
        ]
        bootstrap_remote.run(args=cmd)

        # join additional nodes
        joins = []
        for remote, ip in ctx.kubeadm[cluster_name].remotes.items():
            if remote == bootstrap_remote:
                continue
            cmd = [
                'sudo', 'kubeadm', 'join',
                ctx.kubeadm[cluster_name].remotes[ctx.kubeadm[cluster_name].bootstrap_remote] + ':6443',
                '--node-name', remote.shortname,
                '--token', ctx.kubeadm[cluster_name].token,
                '--discovery-token-unsafe-skip-ca-verification',
            ]
            joins.append(remote.run(args=cmd, wait=False))
        run.wait(joins)
        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        log.info('Cleaning up node')
        run.wait(
            ctx.cluster.run(
                args=['sudo', 'kubeadm', 'reset', 'cleanup-node', '-f'],
                wait=False,
            )
        )


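# Distribute the bootstrap node's /etc/kubernetes/admin.conf to ~/.kube/config
# and /root/.kube/config on every remote so kubectl works for both the test
# user and root.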
@contextlib.contextmanager
def kubectl_config(ctx, config):
    cluster_name = config['cluster']
    bootstrap_remote = ctx.kubeadm[cluster_name].bootstrap_remote

    ctx.kubeadm[cluster_name].admin_conf = \
        bootstrap_remote.read_file('/etc/kubernetes/admin.conf', sudo=True)

    log.info('Setting up kubectl')
    try:
        ctx.cluster.run(args=[
            'mkdir', '-p', '.kube',
            run.Raw('&&'),
            'sudo', 'mkdir', '-p', '/root/.kube',
        ])
        for remote in ctx.kubeadm[cluster_name].remotes.keys():
            remote.write_file('.kube/config', ctx.kubeadm[cluster_name].admin_conf)
            remote.sudo_write_file('/root/.kube/config',
                                   ctx.kubeadm[cluster_name].admin_conf)
        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        log.info('Deconfiguring kubectl')
        ctx.cluster.run(args=[
            'rm', '-rf', '.kube',
            run.Raw('&&'),
            'sudo', 'rm', '-rf', '/root/.kube',
        ])


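# Map a machine IP to a per-host virtual subnet using the 'vnet' mappings in
# the teuthology config (each entry pairs a 'machine_subnet' with a larger
# 'virtual_subnet').  As a hypothetical example, with machine_subnet
# 172.21.0.0/24 and virtual_subnet 10.128.0.0/16, the Nth usable address in
# the machine subnet maps to the Nth /24 slice of 10.128.0.0/16.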
def map_vnet(mip):
    for mapping in teuth_config.get('vnet', []):
        mnet = ipaddress.ip_network(mapping['machine_subnet'])
        vnet = ipaddress.ip_network(mapping['virtual_subnet'])
        if vnet.prefixlen >= mnet.prefixlen:
            log.error(f"virtual_subnet {vnet} prefix >= machine_subnet {mnet} prefix")
            return None
        if mip in mnet:
            pos = list(mnet.hosts()).index(mip)
            log.info(f"{mip} is in {mnet} at pos {pos}")
            sub = list(vnet.subnets(32 - mnet.prefixlen))[pos]
            return sub
    return None


@contextlib.contextmanager
def allocate_pod_subnet(ctx, config):
    """
    Allocate a private subnet that will not collide with other test machines/clusters
    """
    cluster_name = config['cluster']
    assert cluster_name == 'kubeadm', 'multiple subnets not yet implemented'

    log.info('Identifying pod subnet')
    remote = list(ctx.cluster.remotes.keys())[0]
    ip = remote.ssh.get_transport().getpeername()[0]
    mip = ipaddress.ip_address(ip)
    vnet = map_vnet(mip)
    assert vnet
    log.info(f'Pod subnet: {vnet}')
    ctx.kubeadm[cluster_name].pod_subnet = vnet
    yield


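# Install the pod network add-on: calico by default, or flannel when the task
# config sets 'pod_network: flannel'.  In both cases the subnet allocated by
# allocate_pod_subnet() is used as the pod CIDR, and the add-on is deleted
# again on teardown.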
@contextlib.contextmanager
def pod_network(ctx, config):
    cluster_name = config['cluster']
    pnet = config.get('pod_network', 'calico')
    if pnet == 'flannel':
        r = ctx.kubeadm[cluster_name].bootstrap_remote.run(
            args=[
                'curl',
                'https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml',
            ],
            stdout=BytesIO(),
        )
        assert r.exitstatus == 0
        flannel = list(yaml.load_all(r.stdout.getvalue(), Loader=yaml.FullLoader))
        for o in flannel:
            if o.get('data', {}).get('net-conf.json'):
                log.info(f'Updating {o}')
                o['data']['net-conf.json'] = o['data']['net-conf.json'].replace(
                    '10.244.0.0/16',
                    str(ctx.kubeadm[cluster_name].pod_subnet)
                )
                log.info(f'Now {o}')
        flannel_yaml = yaml.dump_all(flannel)
        log.debug(f'Flannel:\n{flannel_yaml}')
        _kubectl(ctx, config, ['apply', '-f', '-'], stdin=flannel_yaml)

    elif pnet == 'calico':
        _kubectl(ctx, config, [
            'apply', '-f',
            'https://docs.projectcalico.org/manifests/tigera-operator.yaml'
        ])
        cr = {
            'apiVersion': 'operator.tigera.io/v1',
            'kind': 'Installation',
            'metadata': {'name': 'default'},
            'spec': {
                'calicoNetwork': {
                    'ipPools': [
                        {
                            'blockSize': 26,
                            'cidr': str(ctx.kubeadm[cluster_name].pod_subnet),
                            'encapsulation': 'VXLANCrossSubnet',
                            'natOutgoing': 'Enabled',
                            'nodeSelector': 'all()',
                        }
                    ]
                }
            }
        }
        _kubectl(ctx, config, ['create', '-f', '-'], stdin=yaml.dump(cr))

    else:
        raise RuntimeError(f'unrecognized pod_network {pnet}')

    try:
        yield

    finally:
        if pnet == 'flannel':
            _kubectl(ctx, config, [
                'delete', '-f',
                'https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml',
            ])

        elif pnet == 'calico':
            _kubectl(ctx, config, ['delete', 'installation', 'default'])
            _kubectl(ctx, config, [
                'delete', '-f',
                'https://docs.projectcalico.org/manifests/tigera-operator.yaml'
            ])


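# Each node is expected to list its scratch block devices, one per line, in
# /scratch_devs.  Every device is wiped (first 10 MB) and exposed as a
# Block-mode local PersistentVolume in the 'scratch' storage class, which uses
# the no-provisioner / WaitForFirstConsumer pattern.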
@contextlib.contextmanager
def setup_pvs(ctx, config):
    """
    Create PVs for all scratch LVs and set up a trivial provisioner
    """
    log.info('Scanning for scratch devices')
    crs = []
    for remote in ctx.cluster.remotes.keys():
        ls = remote.read_file('/scratch_devs').decode('utf-8').strip().splitlines()
        log.info(f'Scratch devices on {remote.shortname}: {ls}')
        for dev in ls:
            devname = dev.split('/')[-1].replace("_", "-")
            crs.append({
                'apiVersion': 'v1',
                'kind': 'PersistentVolume',
                'metadata': {'name': f'{remote.shortname}-{devname}'},
                'spec': {
                    'volumeMode': 'Block',
                    'accessModes': ['ReadWriteOnce'],
                    'capacity': {'storage': '100Gi'},  # doesn't matter?
                    'persistentVolumeReclaimPolicy': 'Recycle',
                    'storageClassName': 'scratch',
                    'local': {'path': dev},
                    'nodeAffinity': {
                        'required': {
                            'nodeSelectorTerms': [
                                {
                                    'matchExpressions': [
                                        {
                                            'key': 'kubernetes.io/hostname',
                                            'operator': 'In',
                                            'values': [remote.shortname]
                                        }
                                    ]
                                }
                            ]
                        }
                    }
                }
            })
            # overwriting first few MB is enough to make k8s happy
            remote.run(args=[
                'sudo', 'dd', 'if=/dev/zero', f'of={dev}', 'bs=1M', 'count=10'
            ])
    crs.append({
        'kind': 'StorageClass',
        'apiVersion': 'storage.k8s.io/v1',
        'metadata': {'name': 'scratch'},
        'provisioner': 'kubernetes.io/no-provisioner',
        'volumeBindingMode': 'WaitForFirstConsumer',
    })
    y = yaml.dump_all(crs)
    log.info('Creating PVs + StorageClass')
    log.debug(y)
    _kubectl(ctx, config, ['create', '-f', '-'], stdin=y)

    yield


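# Remove the node-role.kubernetes.io/master taint from the bootstrap node so
# ordinary pods can be scheduled there as well.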
@contextlib.contextmanager
def final(ctx, config):
    cluster_name = config['cluster']

    # remove master node taint
    _kubectl(ctx, config, [
        'taint', 'node',
        ctx.kubeadm[cluster_name].bootstrap_remote.shortname,
        'node-role.kubernetes.io/master-',
        run.Raw('||'),
        'true',
    ])

    yield


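# Top-level entry point: merge 'overrides: kubeadm:' from the job config, set
# up the per-cluster namespace under ctx.kubeadm, and run the stages above in
# order, tearing everything down again when the nested context exits.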
@contextlib.contextmanager
def task(ctx, config):
    if not config:
        config = {}
    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    log.info('Kubeadm start')

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('kubeadm', {}))
    log.info('Config: ' + str(config))

    # set up cluster context
    if not hasattr(ctx, 'kubeadm'):
        ctx.kubeadm = {}
    if 'cluster' not in config:
        config['cluster'] = 'kubeadm'
    cluster_name = config['cluster']
    if cluster_name not in ctx.kubeadm:
        ctx.kubeadm[cluster_name] = argparse.Namespace()

    with contextutil.nested(
            lambda: preflight(ctx, config),
            lambda: allocate_pod_subnet(ctx, config),
            lambda: kubeadm_install(ctx, config),
            lambda: kubeadm_init_join(ctx, config),
            lambda: kubectl_config(ctx, config),
            lambda: pod_network(ctx, config),
            lambda: setup_pvs(ctx, config),
            lambda: final(ctx, config),
    ):
        try:
            log.info('Kubeadm complete, yielding')
            yield

        finally:
            log.info('Tearing down kubeadm')