# -*- mode: YAML -*-
---

options:
- name: osd_numa_prefer_iface
  type: bool
  level: advanced
  desc: prefer IP on network interface on same numa node as storage
  default: true
  see_also:
  - osd_numa_auto_affinity
  flags:
  - startup
- name: osd_numa_auto_affinity
  type: bool
  level: advanced
  desc: automatically set affinity to numa node when storage and network match
  default: true
  flags:
  - startup
- name: osd_numa_node
  type: int
  level: advanced
  desc: set affinity to a numa node (-1 for none)
  default: -1
  see_also:
  - osd_numa_auto_affinity
  flags:
  - startup
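# Example (illustrative, assuming a daemon named osd.0 and target node 1);
# since these are startup options, the OSD must be restarted for the change
# to take effect:
#   ceph config set osd.0 osd_numa_node 1
#   ceph config set osd.0 osd_numa_auto_affinity false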
- name: set_keepcaps
  type: bool
  level: advanced
  desc: set the keepcaps flag before changing UID, preserving the permitted capability set
  long_desc: When ceph switches from root to the ceph uid, all capabilities in all sets are erased. If
    a component that is capability-aware needs a specific capability, the keepcaps flag maintains
    the permitted capability set, allowing the capabilities in the effective set to be activated as needed.
  default: false
  flags:
  - startup
- name: osd_smart_report_timeout
  type: uint
  level: advanced
  desc: Timeout (in seconds) for smartctl to run, default is set to 5
  default: 5
# verify backend can support configured max object name length
- name: osd_check_max_object_name_len_on_startup
  type: bool
  level: dev
  default: true
  with_legacy: true
- name: osd_max_backfills
  type: uint
  level: advanced
  desc: Maximum number of concurrent local and remote backfills or recoveries per
    OSD
  long_desc: There can be osd_max_backfills local reservations AND the same remote
    reservations per OSD. So a value of 1 lets this OSD participate as 1 PG primary
    in recovery and 1 shard of another recovering PG.
  fmt_desc: The maximum number of backfills allowed to or from a single OSD.
    Note that this is applied separately for read and write operations.
  default: 1
  flags:
  - runtime
  with_legacy: true
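# Since osd_max_backfills is a runtime option, it can be raised temporarily to
# speed up recovery; a minimal sketch (the value 2 is illustrative only):
#   ceph config set osd osd_max_backfills 2
# Note that when osd_op_queue = mclock_scheduler, such changes are ignored
# unless osd_mclock_override_recovery_settings is enabled (see below).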
# Minimum recovery priority (255 = max, smaller = lower)
- name: osd_min_recovery_priority
  type: int
  level: advanced
  desc: Minimum priority below which recovery is not performed
  long_desc: The purpose here is to prevent the cluster from doing *any* lower priority
    work (e.g., rebalancing) below this threshold and focus solely on higher priority
    work (e.g., replicating degraded objects).
  default: 0
  with_legacy: true
- name: osd_backfill_retry_interval
  type: float
  level: advanced
  desc: how frequently to retry backfill reservations after being denied (e.g., due
    to a full OSD)
  fmt_desc: The number of seconds to wait before retrying backfill requests.
  default: 30
  with_legacy: true
- name: osd_recovery_retry_interval
  type: float
  level: advanced
  desc: how frequently to retry recovery reservations after being denied (e.g., due
    to a full OSD)
  default: 30
  with_legacy: true
- name: osd_recovery_sleep
  type: float
  level: advanced
  desc: Time in seconds to sleep before next recovery or backfill op. This setting
    overrides _ssd, _hdd, and _hybrid if non-zero.
  fmt_desc: Time in seconds to sleep before the next recovery or backfill op.
    Increasing this value will slow down recovery operations while
    client operations will be less impacted.
  default: 0
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_sleep_hdd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next recovery or backfill op for HDDs
  fmt_desc: Time in seconds to sleep before next recovery or backfill op
    for HDDs.
  default: 0.1
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_sleep_ssd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next recovery or backfill op for SSDs
  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
    for SSDs.
  default: 0
  see_also:
  - osd_recovery_sleep
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_sleep_hybrid
  type: float
  level: advanced
  desc: Time in seconds to sleep before next recovery or backfill op when data is
    on HDD and journal is on SSD
  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
    when OSD data is on HDD and OSD journal / WAL+DB is on SSD.
  default: 0.025
  see_also:
  - osd_recovery_sleep
  flags:
  - runtime
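# The device-specific sleeps above are runtime options; a hypothetical example
# of throttling recovery cluster-wide on spinning disks (value illustrative):
#   ceph config set osd osd_recovery_sleep_hdd 0.2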
- name: osd_snap_trim_sleep
  type: float
  level: advanced
  desc: Time in seconds to sleep before next snap trim. This setting overrides _ssd,
    _hdd, and _hybrid if non-zero.
  fmt_desc: Time in seconds to sleep before next snap trim op.
    Increasing this value will slow down snap trimming.
    This option overrides backend specific variants.
  default: 0
  flags:
  - runtime
  with_legacy: true
- name: osd_snap_trim_sleep_hdd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next snap trim for HDDs
  default: 5
  flags:
  - runtime
- name: osd_snap_trim_sleep_ssd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next snap trim for SSDs
  fmt_desc: Time in seconds to sleep before next snap trim op
    for SSD OSDs (including NVMe).
  default: 0
  flags:
  - runtime
- name: osd_snap_trim_sleep_hybrid
  type: float
  level: advanced
  desc: Time in seconds to sleep before next snap trim when data is on HDD and journal
    is on SSD
  fmt_desc: Time in seconds to sleep before next snap trim op
    when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD.
  default: 2
  flags:
  - runtime
- name: osd_scrub_invalid_stats
  type: bool
  level: advanced
  default: true
  with_legacy: true
- name: osd_max_scrubs
  type: int
  level: advanced
  desc: Maximum concurrent scrubs on a single OSD
  fmt_desc: The maximum number of simultaneous scrub operations for
    a Ceph OSD Daemon.
  default: 1
  with_legacy: true
- name: osd_scrub_during_recovery
  type: bool
  level: advanced
  desc: Allow scrubbing when PGs on the OSD are undergoing recovery
  fmt_desc: Allow scrub during recovery. Setting this to ``false`` will disable
    scheduling new scrubs (and deep-scrubs) while there is active recovery.
    Already running scrubs will be continued. This might be useful to reduce
    load on busy clusters.
  default: false
  with_legacy: true
- name: osd_repair_during_recovery
  type: bool
  level: advanced
  desc: Allow requested repairing when PGs on the OSD are undergoing recovery
  default: false
  with_legacy: true
- name: osd_scrub_begin_hour
  type: int
  level: advanced
  desc: Restrict scrubbing to this hour of the day or later
  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
  fmt_desc: This restricts scrubbing to this hour of the day or later.
    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0``
    to allow scrubbing the entire day. Along with ``osd_scrub_end_hour``, they
    define a time window in which scrubs can happen. But a scrub will be
    performed regardless of whether the time window allows it, as long as the
    placement group's scrub interval exceeds ``osd_scrub_max_interval``.
  default: 0
  see_also:
  - osd_scrub_end_hour
  min: 0
  max: 23
  with_legacy: true
- name: osd_scrub_end_hour
  type: int
  level: advanced
  desc: Restrict scrubbing to hours of the day earlier than this
  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
  fmt_desc: This restricts scrubbing to the hour earlier than this.
    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing
    for the entire day. Along with ``osd_scrub_begin_hour``, they define a time
    window in which scrubs can happen. But a scrub will be performed
    regardless of whether the time window allows it, as long as the placement
    group's scrub interval exceeds ``osd_scrub_max_interval``.
  default: 0
  see_also:
  - osd_scrub_begin_hour
  min: 0
  max: 23
  with_legacy: true
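# Illustrative ceph.conf snippet (assumed values) restricting scrubs to a
# nightly window; when begin > end, the window is assumed to wrap past
# midnight (22:00 through 07:00 the next day):
#   [osd]
#   osd_scrub_begin_hour = 22
#   osd_scrub_end_hour = 7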
- name: osd_scrub_begin_week_day
  type: int
  level: advanced
  desc: Restrict scrubbing to this day of the week or later
  long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
    for the entire week.
  fmt_desc: This restricts scrubbing to this day of the week or later.
    0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
    and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
    Along with ``osd_scrub_end_week_day``, they define a time window in which
    scrubs can happen. But a scrub will be performed
    regardless of whether the time window allows it, when the PG's
    scrub interval exceeds ``osd_scrub_max_interval``.
  default: 0
  see_also:
  - osd_scrub_end_week_day
  min: 0
  max: 6
  with_legacy: true
- name: osd_scrub_end_week_day
  type: int
  level: advanced
  desc: Restrict scrubbing to days of the week earlier than this
  long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
    for the entire week.
  fmt_desc: This restricts scrubbing to days of the week earlier than this.
    0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
    and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
    Along with ``osd_scrub_begin_week_day``, they define a time
    window in which scrubs can happen. But a scrub will be performed
    regardless of whether the time window allows it, as long as the placement
    group's scrub interval exceeds ``osd_scrub_max_interval``.
  default: 0
  see_also:
  - osd_scrub_begin_week_day
  min: 0
  max: 6
  with_legacy: true
- name: osd_scrub_load_threshold
  type: float
  level: advanced
  desc: Allow scrubbing when system load divided by number of CPUs is below this value
  fmt_desc: The normalized maximum load. Ceph will not scrub when the system load
    (as defined by ``getloadavg() / number of online CPUs``) is higher than this number.
    Default is ``0.5``.
  default: 0.5
  with_legacy: true
# if load is low
- name: osd_scrub_min_interval
  type: float
  level: advanced
  desc: Scrub each PG no more often than this interval
  fmt_desc: The minimal interval in seconds for scrubbing the Ceph OSD Daemon
    when the Ceph Storage Cluster load is low.
  default: 1_day
  see_also:
  - osd_scrub_max_interval
  with_legacy: true
# regardless of load
- name: osd_scrub_max_interval
  type: float
  level: advanced
  desc: Scrub each PG no less often than this interval
  fmt_desc: The maximum interval in seconds for scrubbing the Ceph OSD Daemon
    irrespective of cluster load.
  default: 7_day
  see_also:
  - osd_scrub_min_interval
  with_legacy: true
# randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
- name: osd_scrub_interval_randomize_ratio
  type: float
  level: advanced
  desc: Ratio of scrub interval to randomly vary
  long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals
    so that they are soon uniformly distributed over the week
  fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling
    the next scrub job for a PG. The delay is a random
    value less than ``osd_scrub_min_interval`` \* ``osd_scrub_interval_randomize_ratio``.
    The default setting spreads scrubs throughout the allowed time
    window of ``[1, 1.5]`` \* ``osd_scrub_min_interval``.
  default: 0.5
  see_also:
  - osd_scrub_min_interval
  with_legacy: true
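# Worked example with the defaults: osd_scrub_min_interval = 1_day and
# osd_scrub_interval_randomize_ratio = 0.5 schedule each PG's next scrub at a
# uniformly random point in [1.0, 1.5) days after the previous one.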
# the probability to back off the scheduled scrub
- name: osd_scrub_backoff_ratio
  type: float
  level: dev
  desc: Backoff ratio for scheduling scrubs
  long_desc: This is the percentage of ticks that do NOT schedule scrubs; 66% means
    that 1 out of 3 ticks will schedule scrubs
  default: 0.66
  with_legacy: true
- name: osd_scrub_chunk_min
  type: int
  level: advanced
  desc: Minimum number of objects to deep-scrub in a single chunk
  fmt_desc: The minimal number of object store chunks to scrub during a single operation.
    Ceph blocks writes to a single chunk during scrub.
  default: 5
  see_also:
  - osd_scrub_chunk_max
  with_legacy: true
- name: osd_scrub_chunk_max
  type: int
  level: advanced
  desc: Maximum number of objects to deep-scrub in a single chunk
  fmt_desc: The maximum number of object store chunks to scrub during a single operation.
  default: 25
  see_also:
  - osd_scrub_chunk_min
  with_legacy: true
- name: osd_shallow_scrub_chunk_min
  type: int
  level: advanced
  desc: Minimum number of objects to scrub in a single chunk
  fmt_desc: The minimum number of object store chunks to scrub during a single operation.
    Not applicable to deep scrubs.
    Ceph blocks writes to a single chunk during scrub.
  default: 50
  see_also:
  - osd_shallow_scrub_chunk_max
  - osd_scrub_chunk_min
  with_legacy: true
- name: osd_shallow_scrub_chunk_max
  type: int
  level: advanced
  desc: Maximum number of objects to scrub in a single chunk
  fmt_desc: The maximum number of object store chunks to scrub during a single operation.
    Not applicable to deep scrubs.
  default: 100
  see_also:
  - osd_shallow_scrub_chunk_min
  - osd_scrub_chunk_max
  with_legacy: true
# sleep between [deep]scrub ops
- name: osd_scrub_sleep
  type: float
  level: advanced
  desc: Duration to inject a delay during scrubbing
  fmt_desc: Time to sleep before scrubbing the next group of chunks. Increasing this value will slow
    down the overall rate of scrubbing so that client operations will be less impacted.
  default: 0
  flags:
  - runtime
  with_legacy: true
# more sleep between [deep]scrub ops
- name: osd_scrub_extended_sleep
  type: float
  level: advanced
  desc: Duration to inject a delay during scrubbing outside of scrubbing hours
  default: 0
  see_also:
  - osd_scrub_begin_hour
  - osd_scrub_end_hour
  - osd_scrub_begin_week_day
  - osd_scrub_end_week_day
  with_legacy: true
# whether to auto-repair inconsistencies upon deep-scrubbing
- name: osd_scrub_auto_repair
  type: bool
  level: advanced
  desc: Automatically repair damaged objects detected during scrub
  fmt_desc: Setting this to ``true`` will enable automatic PG repair when errors
    are found by scrubs or deep-scrubs. However, if more than
    ``osd_scrub_auto_repair_num_errors`` errors are found a repair is NOT performed.
  default: false
  with_legacy: true
# only auto-repair when number of errors is below this threshold
- name: osd_scrub_auto_repair_num_errors
  type: uint
  level: advanced
  desc: Maximum number of detected errors to automatically repair
  fmt_desc: Auto repair will not occur if more than this many errors are found.
  default: 5
  see_also:
  - osd_scrub_auto_repair
  with_legacy: true
- name: osd_scrub_max_preemptions
  type: uint
  level: advanced
  desc: Set the maximum number of times we will preempt a deep scrub due to a client
    operation before blocking client IO to complete the scrub
  default: 5
  min: 0
  max: 30
- name: osd_deep_scrub_interval
  type: float
  level: advanced
  desc: Deep scrub each PG (i.e., verify data checksums) at least this often
  fmt_desc: The interval for "deep" scrubbing (fully reading all data). The
    ``osd_scrub_load_threshold`` does not affect this setting.
  default: 7_day
  with_legacy: true
- name: osd_deep_scrub_randomize_ratio
  type: float
  level: advanced
  desc: Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs
    are deep)
  long_desc: This prevents a deep scrub 'stampede' by spreading deep scrubs so they
    are uniformly distributed over the week
  default: 0.15
  with_legacy: true
- name: osd_deep_scrub_stride
  type: size
  level: advanced
  desc: Number of bytes to read from an object at a time during deep scrub
  fmt_desc: Read size when doing a deep scrub.
  default: 512_K
  with_legacy: true
- name: osd_deep_scrub_keys
  type: int
  level: advanced
  desc: Number of keys to read from an object at a time during deep scrub
  default: 1024
  with_legacy: true
# objects must be this old (seconds) before we update the whole-object digest on scrub
- name: osd_deep_scrub_update_digest_min_age
  type: int
  level: advanced
  desc: Update overall object digest only if object was last modified longer ago than
    this
  default: 2_hr
  with_legacy: true
- name: osd_deep_scrub_large_omap_object_key_threshold
  type: uint
  level: advanced
  desc: Warn when we encounter an object with more omap keys than this
  default: 200000
  services:
  - osd
  - mds
  see_also:
  - osd_deep_scrub_large_omap_object_value_sum_threshold
  with_legacy: true
- name: osd_deep_scrub_large_omap_object_value_sum_threshold
  type: size
  level: advanced
  desc: Warn when we encounter an object with more omap key bytes than this
  default: 1_G
  services:
  - osd
  see_also:
  - osd_deep_scrub_large_omap_object_key_threshold
  with_legacy: true
# when scrubbing blocks on a locked object
- name: osd_blocked_scrub_grace_period
  type: int
  level: advanced
  desc: Time (seconds) before issuing a cluster-log warning
  long_desc: The warning is issued after waiting too long for an object in the
    scrubbed chunk to be unlocked.
  default: 120
  with_legacy: true
# timely updates to the 'pg dump' output, esp. re scrub scheduling
- name: osd_stats_update_period_scrubbing
  type: int
  level: advanced
  desc: Stats update period (seconds) when scrubbing
  long_desc: A PG actively scrubbing (or blocked while scrubbing) publishes its
    stats (inc. scrub/block duration) every this many seconds.
  default: 15
  with_legacy: false
- name: osd_stats_update_period_not_scrubbing
  type: int
  level: advanced
  desc: Stats update period (seconds) when not scrubbing
  long_desc: A PG for which we are the primary publishes its
    stats (inc. scrub/block duration) every this many seconds.
  default: 120
  with_legacy: false
# when replicas are slow to respond to scrub resource reservations
# Note: disable by using a very large value
- name: osd_scrub_slow_reservation_response
  type: millisecs
  level: advanced
  desc: Duration before issuing a cluster-log warning
  long_desc: The warning is issued after waiting too long for a replica to respond
    (after at least half of the replicas have responded).
  default: 2200
  min: 500
  see_also:
  - osd_scrub_reservation_timeout
  with_legacy: false
# when a replica does not respond to scrub resource request
# Note: disable by using a very large value
- name: osd_scrub_reservation_timeout
  type: millisecs
  level: advanced
  desc: Duration before aborting the scrub session
  long_desc: The scrub session is aborted after waiting too long for some replicas
    to respond to scrub reservation requests.
  default: 5000
  min: 2000
  see_also:
  - osd_scrub_slow_reservation_response
  with_legacy: false
# where rados plugins are stored
- name: osd_class_dir
  type: str
  level: advanced
  default: @CMAKE_INSTALL_LIBDIR@/rados-classes
  fmt_desc: The class path for RADOS class plug-ins.
  with_legacy: true
- name: osd_open_classes_on_start
  type: bool
  level: advanced
  default: true
  with_legacy: true
# list of object classes allowed to be loaded (allow all: *)
- name: osd_class_load_list
  type: str
  level: advanced
  default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
    user version cas cmpomap queue 2pc_queue fifo
  with_legacy: true
# list of object classes with default execute perm (allow all: *)
- name: osd_class_default_list
  type: str
  level: advanced
  default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
    user version cas cmpomap queue 2pc_queue fifo
  with_legacy: true
- name: osd_agent_max_ops
  type: int
  level: advanced
  desc: maximum concurrent tiering operations for tiering agent
  fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
    in the high speed mode.
  default: 4
  with_legacy: true
- name: osd_agent_max_low_ops
  type: int
  level: advanced
  desc: maximum concurrent low-priority tiering operations for tiering agent
  fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
    in the low speed mode.
  default: 2
  with_legacy: true
- name: osd_agent_min_evict_effort
  type: float
  level: advanced
  desc: minimum effort to expend evicting clean objects
  default: 0.1
  min: 0
  max: 0.99
  with_legacy: true
- name: osd_agent_quantize_effort
  type: float
  level: advanced
  desc: size of quantize unit for eviction effort
  default: 0.1
  with_legacy: true
- name: osd_agent_delay_time
  type: float
  level: advanced
  desc: how long agent should sleep if it has no work to do
  default: 5
  with_legacy: true
# decay atime and hist histograms after how many objects go by
- name: osd_agent_hist_halflife
  type: int
  level: advanced
  desc: halflife of agent atime and temp histograms
  default: 1000
  with_legacy: true
- name: osd_agent_slop
  type: float
  level: advanced
  desc: slop factor to avoid switching tiering flush and eviction mode
  default: 0.02
  with_legacy: true
- name: osd_find_best_info_ignore_history_les
  type: bool
  level: dev
  desc: ignore last_epoch_started value when peering AND PROBABLY LOSE DATA
  long_desc: THIS IS AN EXTREMELY DANGEROUS OPTION THAT SHOULD ONLY BE USED AT THE
    DIRECTION OF A DEVELOPER. It makes peering ignore the last_epoch_started value
    when peering, which can allow the OSD to believe an OSD has an authoritative view
    of a PG's contents even when it is in fact old and stale, typically leading to
    data loss (by believing a stale PG is up to date).
  default: false
  with_legacy: true
- name: osd_uuid
  type: uuid
  level: advanced
  desc: uuid label for a new OSD
  fmt_desc: The universally unique identifier (UUID) for the Ceph OSD Daemon.
  note: The ``osd_uuid`` applies to a single Ceph OSD Daemon. The ``fsid``
    applies to the entire cluster.
  flags:
  - create
  with_legacy: true
- name: osd_data
  type: str
  level: advanced
  desc: path to OSD data
  fmt_desc: The path to the OSD's data. You must create the directory when
    deploying Ceph. You should mount a drive for OSD data at this
    mount point. We do not recommend changing the default.
  default: /var/lib/ceph/osd/$cluster-$id
  flags:
  - no_mon_update
  with_legacy: true
- name: osd_journal
  type: str
  level: advanced
  desc: path to OSD journal (when FileStore backend is in use)
  fmt_desc: The path to the OSD's journal. This may be a path to a file or a
    block device (such as a partition of an SSD). If it is a file,
    you must create the directory to contain it. We recommend using a
    separate fast device when the ``osd_data`` drive is an HDD.
  default: /var/lib/ceph/osd/$cluster-$id/journal
  flags:
  - no_mon_update
  with_legacy: true
- name: osd_journal_size
  type: size
  level: advanced
  desc: size of FileStore journal (in MiB)
  fmt_desc: The size of the journal in megabytes.
  default: 5_K
  flags:
  - create
  with_legacy: true
- name: osd_journal_flush_on_shutdown
  type: bool
  level: advanced
  desc: flush FileStore journal contents during clean OSD shutdown
  default: true
  with_legacy: true
- name: osd_compact_on_start
  type: bool
  level: advanced
  desc: compact OSD's object store's OMAP on start
  default: false
# flags for specific control purpose during osd mount() process.
# e.g., can be 1 to skip over replaying journal
# or 2 to skip over mounting omap or 3 to skip over both.
# This might be helpful in case the journal is totally corrupted
# and we still want to bring the osd daemon back normally, etc.
- name: osd_os_flags
  type: uint
  level: dev
  desc: flags to skip filestore omap or journal initialization
  default: 0
- name: osd_max_write_size
  type: size
  level: advanced
  desc: Maximum size of a RADOS write operation in megabytes
  long_desc: This setting prevents clients from doing very large writes to RADOS. If
    you set this to a value below what clients expect, they will receive an error
    when attempting to write to the cluster.
  fmt_desc: The maximum size of a write in megabytes.
  default: 90
  min: 4
  with_legacy: true
- name: osd_max_pgls
  type: uint
  level: advanced
  desc: maximum number of results when listing objects in a pool
  fmt_desc: The maximum number of objects to return per listing request. A client
    requesting a large number can tie up the Ceph OSD Daemon.
  default: 1_K
  with_legacy: true
- name: osd_client_message_size_cap
  type: size
  level: advanced
  desc: maximum memory to devote to in-flight client requests
  long_desc: If this value is exceeded, the OSD will not read any new client data
    off of the network until memory is freed.
  fmt_desc: The largest client data message allowed in memory.
  default: 500_M
  with_legacy: true
- name: osd_client_message_cap
  type: uint
  level: advanced
  desc: maximum number of in-flight client requests
  default: 256
  with_legacy: true
- name: osd_crush_update_on_start
  type: bool
  level: advanced
  desc: update OSD CRUSH location on startup
  default: true
  with_legacy: true
- name: osd_class_update_on_start
  type: bool
  level: advanced
  desc: set OSD device class on startup
  default: true
  with_legacy: true
- name: osd_crush_initial_weight
  type: float
  level: advanced
  desc: if >= 0, initial CRUSH weight for newly created OSDs
  long_desc: If this value is negative, the size of the OSD in TiB is used.
  fmt_desc: The initial CRUSH weight for newly added OSDs. By default (``-1``),
    the initial CRUSH weight for a newly added OSD is set to its device size in
    TB. See `Weighting Bucket Items`_ for details.
  default: -1
  with_legacy: true
# Allows the "peered" state for recovery and backfill below min_size
- name: osd_allow_recovery_below_min_size
  type: bool
  level: dev
  desc: allow replicated pools to recover with < min_size active members
  default: true
  services:
  - osd
  with_legacy: true
# cap on # of inc maps we send to peers, clients
- name: osd_map_share_max_epochs
  type: int
  level: advanced
  default: 40
  with_legacy: true
- name: osd_map_cache_size
  type: int
  level: advanced
  default: 50
  fmt_desc: The number of OSD maps to keep cached.
  with_legacy: true
- name: osd_pg_epoch_max_lag_factor
  type: float
  level: advanced
  desc: Max multiple of the map cache that PGs can lag before we throttle map ingest
  default: 2
  see_also:
  - osd_map_cache_size
- name: osd_inject_bad_map_crc_probability
  type: float
  level: dev
  default: 0
  with_legacy: true
- name: osd_inject_failure_on_pg_removal
  type: bool
  level: dev
  default: false
  with_legacy: true
# shut down the OSD if its status flips more than max_markdown_count times
# within the most recent max_markdown_period seconds
- name: osd_max_markdown_period
  type: int
  level: advanced
  default: 10_min
  with_legacy: true
- name: osd_max_markdown_count
  type: int
  level: advanced
  default: 5
  with_legacy: true
- name: osd_op_thread_timeout
  type: int
  level: advanced
  default: 15
  fmt_desc: The Ceph OSD Daemon operation thread timeout in seconds.
  with_legacy: true
- name: osd_op_thread_suicide_timeout
  type: int
  level: advanced
  default: 150
  with_legacy: true
- name: osd_op_pq_max_tokens_per_priority
  type: uint
  level: advanced
  default: 4_M
  with_legacy: true
- name: osd_op_pq_min_cost
  type: size
  level: advanced
  default: 64_K
  with_legacy: true
# preserve clone_overlap during recovery/migration
- name: osd_recover_clone_overlap
  type: bool
  level: advanced
  default: true
  fmt_desc: Preserves clone overlap during recovery. Should always be set
    to ``true``.
  with_legacy: true
- name: osd_num_cache_shards
  type: size
  level: advanced
  desc: The number of cache shards to use in the object store.
  default: 32
  flags:
  - startup
- name: osd_aggregated_slow_ops_logging
  type: bool
  level: advanced
  desc: Allow the OSD daemon to send aggregated slow ops to the cluster log
  fmt_desc: If set to ``true``, the OSD daemon will send slow ops information in
    an aggregated format to the cluster log; otherwise it sends every slow op to the
    cluster log.
  default: true
  with_legacy: true
- name: osd_op_num_threads_per_shard
  type: int
  level: advanced
  default: 0
  flags:
  - startup
  with_legacy: true
- name: osd_op_num_threads_per_shard_hdd
  type: int
  level: advanced
  default: 1
  see_also:
  - osd_op_num_threads_per_shard
  flags:
  - startup
  with_legacy: true
- name: osd_op_num_threads_per_shard_ssd
  type: int
  level: advanced
  default: 2
  see_also:
  - osd_op_num_threads_per_shard
  flags:
  - startup
  with_legacy: true
- name: osd_op_num_shards
  type: int
  level: advanced
  fmt_desc: The number of shards allocated for a given OSD. Each shard has its own processing queue.
    PGs on the OSD are distributed evenly across the shards. This setting overrides _ssd and _hdd if
    non-zero.
  default: 0
  flags:
  - startup
  with_legacy: true
- name: osd_op_num_shards_hdd
  type: int
  level: advanced
  fmt_desc: the number of shards allocated for a given OSD (for rotational media).
  default: 5
  see_also:
  - osd_op_num_shards
  flags:
  - startup
  with_legacy: true
- name: osd_op_num_shards_ssd
  type: int
  level: advanced
  fmt_desc: the number of shards allocated for a given OSD (for solid state media).
  default: 8
  see_also:
  - osd_op_num_shards
  flags:
  - startup
  with_legacy: true
- name: osd_skip_data_digest
  type: bool
  level: dev
  desc: Do not store full-object checksums if the backend (bluestore) does its own
    checksums. Only usable with all BlueStore OSDs.
  default: false
# PrioritizedQueue (prio), Weighted Priority Queue (wpq ; default),
# mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
# and "mclock_client" are based on the mClock/dmClock algorithm
# (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
# class the operation belongs to. "mclock_client" does the same but
# also works to enforce fairness between clients. "debug_random"
# chooses among all four with equal probability.
- name: osd_op_queue
  type: str
  level: advanced
  desc: which operation priority queue algorithm to use
  long_desc: which operation priority queue algorithm to use
  fmt_desc: This sets the type of queue to be used for prioritizing ops
    within each OSD. Both queues feature a strict sub-queue which is
    dequeued before the normal queue. The normal queue is different
    between implementations. The WeightedPriorityQueue (``wpq``)
    dequeues operations in relation to their priorities to prevent
    starvation of any queue. WPQ should help in cases where a few OSDs
    are more overloaded than others. The mClockQueue
    (``mclock_scheduler``) prioritizes operations based on which class
    they belong to (recovery, scrub, snaptrim, client op, osd subop).
    See `QoS Based on mClock`_. Requires a restart.
  default: mclock_scheduler
  see_also:
  - osd_op_queue_cut_off
  enum_values:
  - wpq
  - mclock_scheduler
  - debug_random
  with_legacy: true
# Min priority to go to strict queue. (low, high)
- name: osd_op_queue_cut_off
  type: str
  level: advanced
  desc: the threshold between high priority ops and low priority ops
  long_desc: the threshold between high priority ops that use strict priority ordering
    and low priority ops that use a fairness algorithm that may or may not incorporate
    priority
  fmt_desc: This selects which priority ops will be sent to the strict
    queue versus the normal queue. The ``low`` setting sends all
    replication ops and higher to the strict queue, while the ``high``
    option sends only replication acknowledgment ops and higher to
    the strict queue. Setting this to ``high`` should help when a few
    OSDs in the cluster are very busy especially when combined with
    ``wpq`` in the ``osd_op_queue`` setting. OSDs that are very busy
    handling replication traffic could starve primary client traffic
    on these OSDs without these settings. Requires a restart.
  default: high
  see_also:
  - osd_op_queue
  enum_values:
  - low
  - high
  - debug_random
  with_legacy: true
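# A quick way to inspect the active queue settings on a daemon (osd.0 here is
# an assumed example id); both options require an OSD restart to change:
#   ceph config get osd.0 osd_op_queue
#   ceph config get osd.0 osd_op_queue_cut_off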
- name: osd_mclock_scheduler_client_res
  type: float
  level: advanced
  desc: IO proportion reserved for each client (default). The default value
    of 0 specifies the lowest possible reservation. Any value greater than
    0 and up to 1.0 specifies the minimum IO proportion to reserve for each
    client in terms of a fraction of the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO proportion reserved for each client (default).
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_client_wgt
  type: uint
  level: advanced
  desc: IO share for each client (default) over reservation
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO share for each client (default) over reservation.
  default: 1
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_client_lim
  type: float
  level: advanced
  desc: IO limit for each client (default) over reservation. The default
    value of 0 specifies no limit enforcement, which means each client can
    use the maximum possible IOPS capacity of the OSD. Any value greater
    than 0 and up to 1.0 specifies the upper IO limit over reservation
    that each client receives in terms of a fraction of the OSD's
    maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO limit for each client (default) over reservation.
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_recovery_res
  type: float
  level: advanced
  desc: IO proportion reserved for background recovery (default). The
    default value of 0 specifies the lowest possible reservation. Any value
    greater than 0 and up to 1.0 specifies the minimum IO proportion to
    reserve for background recovery operations in terms of a fraction of
    the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO proportion reserved for background recovery (default).
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_recovery_wgt
  type: uint
  level: advanced
  desc: IO share for each background recovery over reservation
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO share for each background recovery over reservation.
  default: 1
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_recovery_lim
  type: float
  level: advanced
  desc: IO limit for background recovery over reservation. The default
    value of 0 specifies no limit enforcement, which means background
    recovery operation can use the maximum possible IOPS capacity of the
    OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
    limit over reservation that background recovery operation receives in
    terms of a fraction of the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO limit for background recovery over reservation.
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_res
  type: float
  level: advanced
  desc: IO proportion reserved for background best_effort (default). The
    default value of 0 specifies the lowest possible reservation. Any value
    greater than 0 and up to 1.0 specifies the minimum IO proportion to
    reserve for background best_effort operations in terms of a fraction
    of the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO proportion reserved for background best_effort (default).
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_wgt
  type: uint
  level: advanced
  desc: IO share for each background best_effort over reservation
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO share for each background best_effort over reservation.
  default: 1
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_lim
  type: float
  level: advanced
  desc: IO limit for background best_effort over reservation. The default
    value of 0 specifies no limit enforcement, which means background
    best_effort operation can use the maximum possible IOPS capacity of the
    OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
    limit over reservation that background best_effort operation receives
    in terms of a fraction of the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO limit for background best_effort over reservation.
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_anticipation_timeout
  type: float
  level: advanced
  desc: mclock anticipation timeout in seconds
  long_desc: the amount of time that mclock waits until the unused resource is forfeited
  default: 0
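# A sketch of custom-profile tuning with the knobs above (values are
# illustrative, not recommendations): after switching osd_mclock_profile to
# "custom", reserve 20% of each OSD's IOPS capacity for clients and cap
# background recovery at 40%:
#   ceph config set osd osd_mclock_profile custom
#   ceph config set osd osd_mclock_scheduler_client_res 0.2
#   ceph config set osd osd_mclock_scheduler_background_recovery_lim 0.4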
- name: osd_mclock_max_sequential_bandwidth_hdd
  type: size
  level: basic
  desc: The maximum sequential bandwidth in bytes/second of the OSD (for
    rotational media)
  long_desc: This option specifies the maximum sequential bandwidth to consider
    for an OSD whose underlying device type is rotational media. This is
    considered by the mclock scheduler to derive the cost factor to be used in
    QoS calculations. Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
    OSD (for rotational media)
  default: 150_M
  flags:
  - runtime
- name: osd_mclock_max_sequential_bandwidth_ssd
  type: size
  level: basic
  desc: The maximum sequential bandwidth in bytes/second of the OSD (for
    solid state media)
  long_desc: This option specifies the maximum sequential bandwidth to consider
    for an OSD whose underlying device type is solid state media. This is
    considered by the mclock scheduler to derive the cost factor to be used in
    QoS calculations. Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
    OSD (for solid state media)
  default: 1200_M
  flags:
  - runtime
- name: osd_mclock_max_capacity_iops_hdd
  type: float
  level: basic
  desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD
    (for rotational media)
  long_desc: This option specifies the max OSD random write IOPS capacity per
    OSD. Contributes in QoS calculations when enabling a dmclock profile. Only
    considered for osd_op_queue = mclock_scheduler
  fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
    OSD (for rotational media)
  default: 315
  flags:
  - runtime
- name: osd_mclock_max_capacity_iops_ssd
  type: float
  level: basic
  desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD
    (for solid state media)
  long_desc: This option specifies the max OSD random write IOPS capacity per
    OSD. Contributes in QoS calculations when enabling a dmclock profile. Only
    considered for osd_op_queue = mclock_scheduler
  fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
    OSD (for solid state media)
  default: 21500
  flags:
  - runtime
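# If the benchmarked capacity looks wrong (e.g. a mis-detected device), the
# measured value can be overridden per OSD; a hypothetical example:
#   ceph config set osd.0 osd_mclock_max_capacity_iops_ssd 15000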
- name: osd_mclock_force_run_benchmark_on_init
  type: bool
  level: advanced
  desc: Force run the OSD benchmark on OSD initialization/boot-up
  long_desc: This option specifies whether the OSD benchmark must be run during
    the OSD boot-up sequence even if historical data about the OSD iops capacity
    is available in the MON config store. Enable this to refresh the OSD iops
    capacity if the underlying device's performance characteristics have changed
    significantly. Only considered for osd_op_queue = mclock_scheduler.
  fmt_desc: Force run the OSD benchmark on OSD initialization/boot-up
  default: false
  see_also:
  - osd_mclock_max_capacity_iops_hdd
  - osd_mclock_max_capacity_iops_ssd
  flags:
  - startup
- name: osd_mclock_skip_benchmark
  type: bool
  level: dev
  desc: Skip the OSD benchmark on OSD initialization/boot-up
  long_desc: This option specifies whether the OSD benchmark must be skipped during
    the OSD boot-up sequence. Only considered for osd_op_queue = mclock_scheduler.
  fmt_desc: Skip the OSD benchmark on OSD initialization/boot-up
  default: false
  see_also:
  - osd_mclock_max_capacity_iops_hdd
  - osd_mclock_max_capacity_iops_ssd
  flags:
  - runtime
- name: osd_mclock_profile
  type: str
  level: advanced
  desc: Which mclock profile to use
  long_desc: This option specifies the mclock profile to enable - one among the set
    of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: |
    This sets the type of mclock profile to use for providing QoS
    based on operations belonging to different classes (background
    recovery, scrub, snaptrim, client op, osd subop). Once a built-in
    profile is enabled, the lower level mclock resource control
    parameters [*reservation, weight, limit*] and some Ceph
    configuration parameters are set transparently. Note that the
    above does not apply for the *custom* profile.
  default: balanced
  see_also:
  - osd_op_queue
  enum_values:
  - balanced
  - high_recovery_ops
  - high_client_ops
  - custom
  flags:
  - runtime
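# Switching profiles is a runtime operation; for example, to favor client
# traffic cluster-wide (illustrative):
#   ceph config set osd osd_mclock_profile high_client_ops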
- name: osd_mclock_override_recovery_settings
  type: bool
  level: advanced
  desc: Setting this option enables the override of recovery/backfill limits
    for the mClock scheduler.
  long_desc: When set, this option enables overriding the max recovery active
    and the max backfills limits while the mClock scheduler is active. These
    options are not modifiable when the mClock scheduler is active. Any attempt
    to modify these values without setting this option will reset the
    recovery or backfill option back to its default value.
  fmt_desc: Setting this option will enable the override of the
    recovery/backfill limits for the mClock scheduler as defined by the
    ``osd_recovery_max_active_hdd``, ``osd_recovery_max_active_ssd`` and
    ``osd_max_backfills`` options.
  default: false
  see_also:
  - osd_recovery_max_active_hdd
  - osd_recovery_max_active_ssd
  - osd_max_backfills
  flags:
  - runtime
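# Example (assumed values): allow manual recovery tuning under mclock, then
# raise the backfill limit:
#   ceph config set osd osd_mclock_override_recovery_settings true
#   ceph config set osd osd_max_backfills 3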
- name: osd_mclock_iops_capacity_threshold_hdd
  type: float
  level: basic
  desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore
    the OSD bench results for an OSD (for rotational media)
  long_desc: This option specifies the threshold IOPS capacity for an OSD under
    which the OSD bench results can be considered for QoS calculations. Only
    considered for osd_op_queue = mclock_scheduler
  fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
    ignore OSD bench results for an OSD (for rotational media)
  default: 500
  flags:
  - runtime
- name: osd_mclock_iops_capacity_threshold_ssd
  type: float
  level: basic
  desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore
    the OSD bench results for an OSD (for solid state media)
  long_desc: This option specifies the threshold IOPS capacity for an OSD under
    which the OSD bench results can be considered for QoS calculations. Only
    considered for osd_op_queue = mclock_scheduler
  fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
    ignore OSD bench results for an OSD (for solid state media)
  default: 80000
  flags:
  - runtime
# Set to true for testing. Users should NOT set this.
# If set to true even after reading enough shards to
# decode the object, any error will be reported.
- name: osd_read_ec_check_for_errors
  type: bool
  level: advanced
  default: false
  with_legacy: true
- name: osd_recovery_delay_start
  type: float
  level: advanced
  default: 0
  fmt_desc: After peering completes, Ceph will delay for the specified number
    of seconds before starting to recover RADOS objects.
  with_legacy: true
- name: osd_recovery_max_active
  type: uint
  level: advanced
  desc: Number of simultaneous active recovery operations per OSD (overrides _ssd
    and _hdd if non-zero)
  fmt_desc: The number of active recovery requests per OSD at one time. More
    requests will accelerate recovery, but the requests place an
    increased load on the cluster.
  note: This value is only used if it is non-zero. Normally it
    is ``0``, which means that the ``hdd`` or ``ssd`` values
    (below) are used, depending on the type of the primary
    device backing the OSD.
  default: 0
  see_also:
  - osd_recovery_max_active_hdd
  - osd_recovery_max_active_ssd
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_max_active_hdd
  type: uint
  level: advanced
  desc: Number of simultaneous active recovery operations per OSD (for rotational
    devices)
  fmt_desc: The number of active recovery requests per OSD at one time, if the
    primary device is rotational.
  default: 3
  see_also:
  - osd_recovery_max_active
  - osd_recovery_max_active_ssd
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_max_active_ssd
  type: uint
  level: advanced
  desc: Number of simultaneous active recovery operations per OSD (for non-rotational
    solid state devices)
  fmt_desc: The number of active recovery requests per OSD at one time, if the
    primary device is non-rotational (i.e., an SSD).
  default: 10
  see_also:
  - osd_recovery_max_active
  - osd_recovery_max_active_hdd
  flags:
  - runtime
  with_legacy: true
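# Example of the hdd/ssd split (assumed values): leave osd_recovery_max_active
# at 0 and tune the device-specific variants instead:
#   ceph config set osd osd_recovery_max_active_hdd 5
#   ceph config set osd osd_recovery_max_active_ssd 20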
- name: osd_recovery_max_single_start
  type: uint
  level: advanced
  default: 1
  fmt_desc: The maximum number of recovery operations per OSD that will be
    newly started when an OSD is recovering.
  with_legacy: true
# max size of push chunk
- name: osd_recovery_max_chunk
  type: size
  level: advanced
  default: 8_M
  fmt_desc: the maximum total size of data chunks a recovery op can carry.
  with_legacy: true
# max number of omap entries per chunk; 0 to disable limit
- name: osd_recovery_max_omap_entries_per_chunk
  type: uint
  level: advanced
  default: 8096
  with_legacy: true
# max size of a COPYFROM chunk
- name: osd_copyfrom_max_chunk
  type: size
  level: advanced
  default: 8_M
  with_legacy: true
# push cost per object
- name: osd_push_per_object_cost
  type: size
  level: advanced
  default: 1000
  fmt_desc: the overhead for serving a push op
  with_legacy: true
# max size of push message
- name: osd_max_push_cost
  type: size
  level: advanced
  default: 8_M
  with_legacy: true
# max objects in single push op
- name: osd_max_push_objects
  type: uint
  level: advanced
  default: 10
  with_legacy: true
# Only use clone_overlap for recovery if there are fewer than
# osd_recover_clone_overlap_limit entries in the overlap set
- name: osd_recover_clone_overlap_limit
  type: uint
  level: advanced
  default: 10
  flags:
  - runtime
- name: osd_debug_feed_pullee
  type: int
  level: dev
  desc: Feed a pullee, and force primary to pull a currently missing object from it
  default: -1
  with_legacy: true
- name: osd_backfill_scan_min
  type: int
  level: advanced
  default: 64
  fmt_desc: The minimum number of objects per backfill scan.
  with_legacy: true
- name: osd_backfill_scan_max
  type: int
  level: advanced
  default: 512
  fmt_desc: The maximum number of objects per backfill scan.
  with_legacy: true
- name: osd_extblkdev_plugins
  type: str
  level: advanced
  desc: extended block device plugins to load, provide compression feedback at runtime
  default: vdo
  flags:
  - startup
# minimum number of peers
- name: osd_heartbeat_min_peers
  type: int
  level: advanced
  default: 10
  with_legacy: true
- name: osd_delete_sleep
  type: float
  level: advanced
  desc: Time in seconds to sleep before next removal transaction. This setting
    overrides _ssd, _hdd, and _hybrid if non-zero.
  fmt_desc: Time in seconds to sleep before the next removal transaction. This
    throttles the PG deletion process.
  default: 0
  flags:
  - runtime
- name: osd_delete_sleep_hdd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next removal transaction for HDDs
  default: 5
  flags:
  - runtime
- name: osd_delete_sleep_ssd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next removal transaction for SSDs
  default: 1
  flags:
  - runtime
- name: osd_delete_sleep_hybrid
  type: float
  level: advanced
  desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD
    and OSD journal or WAL+DB is on SSD
  default: 1
  flags:
  - runtime
- name: osd_rocksdb_iterator_bounds_enabled
  desc: Whether omap iterator bounds are applied to rocksdb iterator ReadOptions
  type: bool
  level: dev
  default: true
  with_legacy: true