1"""
2Kubernetes cluster task, deployed via kubeadm
3"""
import argparse
import contextlib
import ipaddress
import logging
import random
import yaml
from io import BytesIO

from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology.config import config as teuth_config
from teuthology.orchestra import run

log = logging.getLogger(__name__)

def _kubectl(ctx, config, args, **kwargs):
    # config is normally the task's config dict; kubectl() below may also pass
    # a plain command list, in which case fall back to the default cluster name.
    cluster_name = config['cluster'] if isinstance(config, dict) else 'kubeadm'
    ctx.kubeadm[cluster_name].bootstrap_remote.run(
        args=['kubectl'] + args,
        **kwargs,
    )


def kubectl(ctx, config):
    if isinstance(config, str):
        config = [config]
    assert isinstance(config, list)
    for c in config:
        if isinstance(c, str):
            _kubectl(ctx, config, c.split(' '))
        else:
            _kubectl(ctx, config, c)

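# Usage sketch for the kubectl() helper above (assumes the kubeadm task has
# already set up ctx.kubeadm and the default 'kubeadm' cluster): it accepts a
# single command string, or a list whose items are either command strings
# (split on spaces) or pre-split argument lists.
#
#   kubectl(ctx, 'get nodes')
#   kubectl(ctx, ['get pods -A', ['get', 'svc', '-A']])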

@contextlib.contextmanager
def preflight(ctx, config):
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo', 'modprobe', 'br_netfilter',
                run.Raw('&&'),
                'sudo', 'sysctl', 'net.bridge.bridge-nf-call-ip6tables=1',
                run.Raw('&&'),
                'sudo', 'sysctl', 'net.bridge.bridge-nf-call-iptables=1',
                run.Raw('&&'),
                'sudo', 'sysctl', 'net.ipv4.ip_forward=1',
                run.Raw('&&'),
                'sudo', 'swapoff', '-a',
            ],
            wait=False,
        )
    )
    yield


@contextlib.contextmanager
def kubeadm_install(ctx, config):
    version = config.get('version', '1.21')

    os_type = teuthology.get_distro(ctx)
    os_version = teuthology.get_distro_version(ctx)

    try:
        if os_type in ['centos', 'rhel']:
            os = f"CentOS_{os_version.split('.')[0]}"
            log.info('Installing cri-o')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'curl', '-L', '-o',
                        '/etc/yum.repos.d/devel:kubic:libcontainers:stable.repo',
                        f'https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/{os}/devel:kubic:libcontainers:stable.repo',
                        run.Raw('&&'),
                        'sudo',
                        'curl', '-L', '-o',
                        f'/etc/yum.repos.d/devel:kubic:libcontainers:stable:cri-o:{version}.repo',
                        f'https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable:/cri-o:/{version}/{os}/devel:kubic:libcontainers:stable:cri-o:{version}.repo',
                        run.Raw('&&'),
                        'sudo', 'dnf', 'install', '-y', 'cri-o',
                    ],
                    wait=False,
                )
            )

            log.info('Installing kube{adm,ctl,let}')
            repo = """[kubernetes]
name=Kubernetes
baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-$basearch
enabled=1
gpgcheck=1
repo_gpgcheck=1
gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
"""
            for remote in ctx.cluster.remotes.keys():
                remote.write_file(
                    '/etc/yum.repos.d/kubernetes.repo',
                    repo,
                    sudo=True,
                )
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'dnf', 'install', '-y',
                        'kubelet', 'kubeadm', 'kubectl',
                        'iproute-tc', 'bridge-utils',
                    ],
                    wait=False,
                )
            )

            # fix cni config
            for remote in ctx.cluster.remotes.keys():
                conf = """# from https://github.com/cri-o/cri-o/blob/master/tutorials/kubernetes.md#flannel-network
{
    "name": "crio",
    "type": "flannel"
}
"""
                remote.write_file('/etc/cni/net.d/10-crio-flannel.conf', conf, sudo=True)
                remote.run(args=[
                    'sudo', 'rm', '-f',
                    '/etc/cni/net.d/87-podman-bridge.conflist',
                    '/etc/cni/net.d/100-crio-bridge.conf',
                ])

            # start crio
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'systemctl', 'daemon-reload',
                        run.Raw('&&'),
                        'sudo', 'systemctl', 'enable', 'crio', '--now',
                    ],
                    wait=False,
                )
            )

        elif os_type == 'ubuntu':
            os = f"xUbuntu_{os_version}"
            log.info('Installing kube{adm,ctl,let}')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'apt', 'update',
                        run.Raw('&&'),
                        'sudo', 'apt', 'install', '-y',
                        'apt-transport-https', 'ca-certificates', 'curl',
                        run.Raw('&&'),
                        'sudo', 'curl', '-fsSLo',
                        '/usr/share/keyrings/kubernetes-archive-keyring.gpg',
                        'https://packages.cloud.google.com/apt/doc/apt-key.gpg',
                        run.Raw('&&'),
                        'echo', 'deb [signed-by=/usr/share/keyrings/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main',
                        run.Raw('|'),
                        'sudo', 'tee', '/etc/apt/sources.list.d/kubernetes.list',
                        run.Raw('&&'),
                        'sudo', 'apt', 'update',
                        run.Raw('&&'),
                        'sudo', 'apt', 'install', '-y',
                        'kubelet', 'kubeadm', 'kubectl',
                        'bridge-utils',
                    ],
                    wait=False,
                )
            )

        else:
            raise RuntimeError(f'unsupported distro {os_type} for cri-o')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'systemctl', 'enable', '--now', 'kubelet',
                    run.Raw('&&'),
                    'sudo', 'kubeadm', 'config', 'images', 'pull',
                ],
                wait=False,
            )
        )

        yield

    finally:
        if config.get('uninstall', True):
            log.info('Uninstalling kube{adm,let,ctl}')
            if os_type in ['centos', 'rhel']:
                run.wait(
                    ctx.cluster.run(
                        args=[
                            'sudo', 'rm', '-f',
                            '/etc/yum.repos.d/kubernetes.repo',
                            run.Raw('&&'),
                            'sudo', 'dnf', 'remove', '-y',
                            'kubeadm', 'kubelet', 'kubectl', 'cri-o',
                        ],
                        wait=False
                    )
                )
            elif os_type == 'ubuntu' and False:
                run.wait(
                    ctx.cluster.run(
                        args=[
                            'sudo', 'rm', '-f',
                            '/etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list',
                            f'/etc/apt/sources.list.d/devel:kubic:libcontainers:stable:cri-o:{version}.list',
                            '/etc/apt/trusted.gpg.d/libcontainers-cri-o.gpg',
                            run.Raw('&&'),
                            'sudo', 'apt', 'remove', '-y',
                            'kubeadm', 'kubelet', 'kubectl', 'cri-o', 'cri-o-runc',
                        ],
                        wait=False,
                    )
                )


@contextlib.contextmanager
def kubeadm_init_join(ctx, config):
    cluster_name = config['cluster']

    bootstrap_remote = None
    remotes = {}  # remote -> ip
    for remote, roles in ctx.cluster.remotes.items():
        for role in roles:
            if role.startswith('host.'):
                if not bootstrap_remote:
                    bootstrap_remote = remote
                if remote not in remotes:
                    remotes[remote] = remote.ssh.get_transport().getpeername()[0]
    if not bootstrap_remote:
        raise RuntimeError('must define at least one host.something role')
    ctx.kubeadm[cluster_name].bootstrap_remote = bootstrap_remote
    ctx.kubeadm[cluster_name].remotes = remotes
    ctx.kubeadm[cluster_name].token = 'abcdef.' + ''.join([
        random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for _ in range(16)
    ])
    log.info(f'Token: {ctx.kubeadm[cluster_name].token}')
    log.info(f'Remotes: {ctx.kubeadm[cluster_name].remotes}')

    try:
        # init
        cmd = [
            'sudo', 'kubeadm', 'init',
            '--node-name', ctx.kubeadm[cluster_name].bootstrap_remote.shortname,
            '--token', ctx.kubeadm[cluster_name].token,
            '--pod-network-cidr', str(ctx.kubeadm[cluster_name].pod_subnet),
        ]
        bootstrap_remote.run(args=cmd)

        # join additional nodes
        joins = []
        for remote, ip in ctx.kubeadm[cluster_name].remotes.items():
            if remote == bootstrap_remote:
                continue
            cmd = [
                'sudo', 'kubeadm', 'join',
                ctx.kubeadm[cluster_name].remotes[ctx.kubeadm[cluster_name].bootstrap_remote] + ':6443',
                '--node-name', remote.shortname,
                '--token', ctx.kubeadm[cluster_name].token,
                '--discovery-token-unsafe-skip-ca-verification',
            ]
            joins.append(remote.run(args=cmd, wait=False))
        run.wait(joins)
        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        log.info('Cleaning up node')
        run.wait(
            ctx.cluster.run(
                args=['sudo', 'kubeadm', 'reset', 'cleanup-node', '-f'],
                wait=False,
            )
        )


@contextlib.contextmanager
def kubectl_config(ctx, config):
    cluster_name = config['cluster']
    bootstrap_remote = ctx.kubeadm[cluster_name].bootstrap_remote

    ctx.kubeadm[cluster_name].admin_conf = \
        bootstrap_remote.read_file('/etc/kubernetes/admin.conf', sudo=True)

    log.info('Setting up kubectl')
    try:
        ctx.cluster.run(args=[
            'mkdir', '-p', '.kube',
            run.Raw('&&'),
            'sudo', 'mkdir', '-p', '/root/.kube',
        ])
        for remote in ctx.kubeadm[cluster_name].remotes.keys():
            remote.write_file('.kube/config', ctx.kubeadm[cluster_name].admin_conf)
            remote.sudo_write_file('/root/.kube/config',
                                   ctx.kubeadm[cluster_name].admin_conf)
        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        log.info('Deconfiguring kubectl')
        ctx.cluster.run(args=[
            'rm', '-rf', '.kube',
            run.Raw('&&'),
            'sudo', 'rm', '-rf', '/root/.kube',
        ])


def map_vnet(mip):
    for mapping in teuth_config.get('vnet', []):
        mnet = ipaddress.ip_network(mapping['machine_subnet'])
        vnet = ipaddress.ip_network(mapping['virtual_subnet'])
        if vnet.prefixlen >= mnet.prefixlen:
            log.error(f"virtual_subnet {vnet} prefix >= machine_subnet {mnet} prefix")
            return None
        if mip in mnet:
            pos = list(mnet.hosts()).index(mip)
            log.info(f"{mip} is in {mnet} at pos {pos}")
            sub = list(vnet.subnets(32 - mnet.prefixlen))[pos]
            return sub
    return None

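# Worked example for map_vnet(), with made-up addresses: given a teuthology
# config entry like
#
#   vnet:
#   - machine_subnet: 172.21.0.0/24
#     virtual_subnet: 192.168.0.0/16
#
# a machine IP of 172.21.0.5 is at index 4 of the /24's host list (which starts
# at .1), so the function returns subnet index 4 of the /16 split into /24s,
# i.e. 192.168.4.0/24. Each machine thus gets its own non-overlapping pod CIDR.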

@contextlib.contextmanager
def allocate_pod_subnet(ctx, config):
    """
    Allocate a private subnet that will not collide with other test machines/clusters
    """
    cluster_name = config['cluster']
    assert cluster_name == 'kubeadm', 'multiple subnets not yet implemented'

    log.info('Identifying pod subnet')
    remote = list(ctx.cluster.remotes.keys())[0]
    ip = remote.ssh.get_transport().getpeername()[0]
    mip = ipaddress.ip_address(ip)
    vnet = map_vnet(mip)
    assert vnet
    log.info(f'Pod subnet: {vnet}')
    ctx.kubeadm[cluster_name].pod_subnet = vnet
    yield


@contextlib.contextmanager
def pod_network(ctx, config):
    cluster_name = config['cluster']
    pnet = config.get('pod_network', 'calico')
    if pnet == 'flannel':
        r = ctx.kubeadm[cluster_name].bootstrap_remote.run(
            args=[
                'curl',
                'https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml',
            ],
            stdout=BytesIO(),
        )
        assert r.exitstatus == 0
        flannel = list(yaml.load_all(r.stdout.getvalue(), Loader=yaml.FullLoader))
        for o in flannel:
            if o.get('data', {}).get('net-conf.json'):
                log.info(f'Updating {o}')
                o['data']['net-conf.json'] = o['data']['net-conf.json'].replace(
                    '10.244.0.0/16',
                    str(ctx.kubeadm[cluster_name].pod_subnet)
                )
                log.info(f'Now {o}')
        flannel_yaml = yaml.dump_all(flannel)
        log.debug(f'Flannel:\n{flannel_yaml}')
        _kubectl(ctx, config, ['apply', '-f', '-'], stdin=flannel_yaml)

    elif pnet == 'calico':
        _kubectl(ctx, config, [
            'apply', '-f',
            'https://docs.projectcalico.org/manifests/tigera-operator.yaml'
        ])
        cr = {
            'apiVersion': 'operator.tigera.io/v1',
            'kind': 'Installation',
            'metadata': {'name': 'default'},
            'spec': {
                'calicoNetwork': {
                    'ipPools': [
                        {
                            'blockSize': 26,
                            'cidr': str(ctx.kubeadm[cluster_name].pod_subnet),
                            'encapsulation': 'VXLANCrossSubnet',
                            'natOutgoing': 'Enabled',
                            'nodeSelector': 'all()',
                        }
                    ]
                }
            }
        }
        _kubectl(ctx, config, ['create', '-f', '-'], stdin=yaml.dump(cr))

    else:
        raise RuntimeError(f'unrecognized pod_network {pnet}')

    try:
        yield

    finally:
        if pnet == 'flannel':
            _kubectl(ctx, config, [
                'delete', '-f',
                'https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml',
            ])

        elif pnet == 'calico':
            _kubectl(ctx, config, ['delete', 'installation', 'default'])
            _kubectl(ctx, config, [
                'delete', '-f',
                'https://docs.projectcalico.org/manifests/tigera-operator.yaml'
            ])


@contextlib.contextmanager
def setup_pvs(ctx, config):
    """
    Create PVs for all scratch LVs and set up a trivial provisioner
    """
    log.info('Scanning for scratch devices')
    crs = []
    for remote in ctx.cluster.remotes.keys():
        ls = remote.read_file('/scratch_devs').decode('utf-8').strip().splitlines()
        log.info(f'Scratch devices on {remote.shortname}: {ls}')
        for dev in ls:
            devname = dev.split('/')[-1].replace("_", "-")
            crs.append({
                'apiVersion': 'v1',
                'kind': 'PersistentVolume',
                'metadata': {'name': f'{remote.shortname}-{devname}'},
                'spec': {
                    'volumeMode': 'Block',
                    'accessModes': ['ReadWriteOnce'],
                    'capacity': {'storage': '100Gi'},  # doesn't matter?
                    'persistentVolumeReclaimPolicy': 'Recycle',
                    'storageClassName': 'scratch',
                    'local': {'path': dev},
                    'nodeAffinity': {
                        'required': {
                            'nodeSelectorTerms': [
                                {
                                    'matchExpressions': [
                                        {
                                            'key': 'kubernetes.io/hostname',
                                            'operator': 'In',
                                            'values': [remote.shortname]
                                        }
                                    ]
                                }
                            ]
                        }
                    }
                }
            })
            # overwriting first few MB is enough to make k8s happy
            remote.run(args=[
                'sudo', 'dd', 'if=/dev/zero', f'of={dev}', 'bs=1M', 'count=10'
            ])
    crs.append({
        'kind': 'StorageClass',
        'apiVersion': 'storage.k8s.io/v1',
        'metadata': {'name': 'scratch'},
        'provisioner': 'kubernetes.io/no-provisioner',
        'volumeBindingMode': 'WaitForFirstConsumer',
    })
    y = yaml.dump_all(crs)
    log.info('Creating PVs + StorageClass')
    log.debug(y)
    _kubectl(ctx, config, ['create', '-f', '-'], stdin=y)

    yield

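# Sketch of a claim a consumer could use against the PVs created above
# (illustrative manifest, not produced by this task); storageClassName and
# volumeMode match what setup_pvs() registers:
#
#   apiVersion: v1
#   kind: PersistentVolumeClaim
#   metadata:
#     name: example-scratch-claim
#   spec:
#     storageClassName: scratch
#     volumeMode: Block
#     accessModes: [ReadWriteOnce]
#     resources:
#       requests:
#         storage: 10Gi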

@contextlib.contextmanager
def final(ctx, config):
    cluster_name = config['cluster']

    # remove master node taint
    _kubectl(ctx, config, [
        'taint', 'node',
        ctx.kubeadm[cluster_name].bootstrap_remote.shortname,
        'node-role.kubernetes.io/master-',
        run.Raw('||'),
        'true',
    ])

    yield


@contextlib.contextmanager
def task(ctx, config):
    if not config:
        config = {}
    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    log.info('Kubeadm start')

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('kubeadm', {}))
    log.info('Config: ' + str(config))

    # set up cluster context
    if not hasattr(ctx, 'kubeadm'):
        ctx.kubeadm = {}
    if 'cluster' not in config:
        config['cluster'] = 'kubeadm'
    cluster_name = config['cluster']
    if cluster_name not in ctx.kubeadm:
        ctx.kubeadm[cluster_name] = argparse.Namespace()

    with contextutil.nested(
        lambda: preflight(ctx, config),
        lambda: allocate_pod_subnet(ctx, config),
        lambda: kubeadm_install(ctx, config),
        lambda: kubeadm_init_join(ctx, config),
        lambda: kubectl_config(ctx, config),
        lambda: pod_network(ctx, config),
        lambda: setup_pvs(ctx, config),
        lambda: final(ctx, config),
    ):
        try:
            log.info('Kubeadm complete, yielding')
            yield

        finally:
            log.info('Tearing down kubeadm')