]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/PurgeQueue.cc
Import ceph 15.2.8
[ceph.git] / ceph / src / mds / PurgeQueue.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "common/debug.h"
16#include "mds/mdstypes.h"
17#include "mds/CInode.h"
18#include "mds/MDCache.h"
19
20#include "PurgeQueue.h"
21
f64942e4 22#include <string.h>
7c673cae
FG
23
24#define dout_context cct
25#define dout_subsys ceph_subsys_mds
26#undef dout_prefix
27#define dout_prefix _prefix(_dout, rank) << __func__ << ": "
28static ostream& _prefix(std::ostream *_dout, mds_rank_t rank) {
29 return *_dout << "mds." << rank << ".purge_queue ";
30}
31
// Name -> enum table for PurgeItem actions.  Not referenced in this file;
// presumably used elsewhere (e.g. by tooling/parsing code) to map the
// textual action names to PurgeItem::Action values — keep the strings in
// sync with get_type_str() below.
const std::map<std::string, PurgeItem::Action> PurgeItem::actions = {
  {"NONE", PurgeItem::NONE},
  {"PURGE_FILE", PurgeItem::PURGE_FILE},
  {"TRUNCATE_FILE", PurgeItem::TRUNCATE_FILE},
  {"PURGE_DIR", PurgeItem::PURGE_DIR}
};
38
// Serialize this PurgeItem (struct_v 2, compat 1) into @bl.
// The field order here is the wire format — it must match the !done
// (correct-encoding) path of PurgeItem::decode() below.
void PurgeItem::encode(bufferlist &bl) const
{
  ENCODE_START(2, 1, bl);
  encode((uint8_t)action, bl);
  encode(ino, bl);
  encode(size, bl);
  encode(layout, bl, CEPH_FEATURE_FS_FILE_LAYOUT_V2);
  encode(old_pools, bl);
  encode(snapc, bl);
  encode(fragtree, bl);
  encode(stamp, bl);
  // Append pad_size bytes of 0xff padding — presumably to keep journal
  // entries at a fixed size (see purge_item_journal_size in activate()).
  uint8_t static const pad = 0xff;
  for (unsigned int i = 0; i<pad_size; i++) {
    encode(pad, bl);
  }
  ENCODE_FINISH(bl);
}
56
// Deserialize a PurgeItem from @p, tolerating two layouts:
//  - struct_v == 1 may carry the broken field order written by v13.2.2
//    (stamp/pad first); we speculatively try that layout and roll the
//    iterator back if it does not parse cleanly within struct_end.
//  - otherwise (or on fallback) decode the canonical order matching
//    encode() above, with stamp appended only for struct_v >= 2.
void PurgeItem::decode(bufferlist::const_iterator &p)
{
  DECODE_START(2, p);
  bool done = false;
  if (struct_v == 1) {
    // Remember the start so we can rewind if the v13.2.2 layout fails.
    auto p_start = p;
    try {
      // bad encoding introduced by v13.2.2
      decode(stamp, p);
      decode(pad_size, p);
      p += pad_size;
      uint8_t raw_action;
      decode(raw_action, p);
      action = (Action)raw_action;
      decode(ino, p);
      decode(size, p);
      decode(layout, p);
      decode(old_pools, p);
      decode(snapc, p);
      decode(fragtree, p);
      // Overrunning the declared struct extent means this was not the
      // broken layout after all — force the fallback path.
      if (p.get_off() > struct_end)
	throw buffer::end_of_buffer();
      done = true;
    } catch (const buffer::error &e) {
      // Rewind and retry with the canonical field order.
      p = p_start;
    }
  }
  if (!done) {
    uint8_t raw_action;
    decode(raw_action, p);
    action = (Action)raw_action;
    decode(ino, p);
    decode(size, p);
    decode(layout, p);
    decode(old_pools, p);
    decode(snapc, p);
    decode(fragtree, p);
    if (struct_v >= 2) {
      decode(stamp, p);
    }
  }
  DECODE_FINISH(p);
}
100
// TODO: if Objecter has any slow requests, take that as a hint and
// slow down our rate of purging (keep accepting pushes though)
//
// Construct a per-rank purge queue backed by its own journal
// (inode MDS_INO_PURGE_QUEUE + rank in the metadata pool).
// Takes ownership of @on_error_: it is either handed to the finisher in
// _go_readonly() or deleted in the destructor.
PurgeQueue::PurgeQueue(
      CephContext *cct_,
      mds_rank_t rank_,
      const int64_t metadata_pool_,
      Objecter *objecter_,
      Context *on_error_)
  :
    cct(cct_),
    rank(rank_),
    metadata_pool(metadata_pool_),
    finisher(cct, "PurgeQueue", "PQ_Finisher"),
    timer(cct, lock),
    filer(objecter_, &finisher),
    objecter(objecter_),
    journaler("pq", MDS_INO_PURGE_QUEUE + rank, metadata_pool,
      CEPH_FS_ONDISK_MAGIC, objecter_, nullptr, 0,
      &finisher),
    on_error(on_error_)
{
  ceph_assert(cct != nullptr);
  ceph_assert(on_error != nullptr);
  ceph_assert(objecter != nullptr);
  // Journal write failures are fatal for the queue: route them to on_error.
  journaler.set_write_error_handler(on_error);
}
127
PurgeQueue::~PurgeQueue()
{
  // Deregister perf counters if create_logger() was ever called.
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
  // on_error is nullptr if _go_readonly() already queued it on the
  // finisher (which then owns it); deleting nullptr is a no-op.
  delete on_error;
}
135
// Build and register the "purge_queue" perf counters with the global
// collection.  Must be called before init() (init() asserts on logger).
// The counters are deregistered in the destructor.
void PurgeQueue::create_logger()
{
  PerfCountersBuilder pcb(g_ceph_context, "purge_queue", l_pq_first, l_pq_last);

  pcb.add_u64_counter(l_pq_executed, "pq_executed", "Purge queue tasks executed",
                      "purg", PerfCountersBuilder::PRIO_INTERESTING);

  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  pcb.add_u64(l_pq_executing_ops, "pq_executing_ops", "Purge queue ops in flight");
  pcb.add_u64(l_pq_executing_ops_high_water, "pq_executing_ops_high_water", "Maximum number of executing file purge ops");
  pcb.add_u64(l_pq_executing, "pq_executing", "Purge queue tasks in flight");
  pcb.add_u64(l_pq_executing_high_water, "pq_executing_high_water", "Maximum number of executing file purges");
  pcb.add_u64(l_pq_item_in_journal, "pq_item_in_journal", "Purge item left in journal");

  logger.reset(pcb.create_perf_counters());
  g_ceph_context->get_perfcounters_collection()->add(logger.get());
}
153
// Start the worker threads (finisher) and the timer.
// create_logger() must have been called first.
void PurgeQueue::init()
{
  std::lock_guard l(lock);

  ceph_assert(logger != nullptr);

  finisher.start();
  timer.init();
}
163
// Called once the queue is recovered/writeable: compute the per-item
// journal footprint and, if there is backlog in the journal with nothing
// currently executing, kick off consumption.
void PurgeQueue::activate()
{
  std::lock_guard l(lock);

  {
    PurgeItem item;
    bufferlist bl;

    // calculate purge item serialized size stored in journal
    // used to count how many items still left in journal later
    ::encode(item, bl);
    purge_item_journal_size = bl.length() + journaler.get_journal_envelope_size();
  }

  if (readonly) {
    dout(10) << "skipping activate: PurgeQueue is readonly" << dendl;
    return;
  }

  // read_pos == write_pos means the journal backlog is empty.
  if (journaler.get_read_pos() == journaler.get_write_pos())
    return;

  if (in_flight.empty()) {
    dout(4) << "start work (by drain)" << dendl;
    // Start consuming from the finisher's context (it re-takes the lock)
    // rather than recursing under the current one.
    finisher.queue(new LambdaContext([this](int r) {
      std::lock_guard l(lock);
      _consume();
    }));
  }
}
194
// Tear down the journaler, timer and finisher.  Counterpart to init().
void PurgeQueue::shutdown()
{
  std::lock_guard l(lock);

  journaler.shutdown();
  timer.shutdown();
  finisher.stop();
}
203
// Load (or, on -ENOENT, create) the purge queue journal.  @completion, if
// non-null, is queued on waiting_for_recovery and fired once recovery
// finishes (or fails via _go_readonly()).
void PurgeQueue::open(Context *completion)
{
  dout(4) << "opening" << dendl;

  std::lock_guard l(lock);

  if (completion)
    waiting_for_recovery.push_back(completion);

  journaler.recover(new LambdaContext([this](int r){
    if (r == -ENOENT) {
      // No journal yet: first boot after an upgrade.  create() takes the
      // lock itself and completes waiting_for_recovery when done.
      dout(1) << "Purge Queue not found, assuming this is an upgrade and "
                 "creating it." << dendl;
      create(NULL);
    } else if (r == 0) {
      std::lock_guard l(lock);
      dout(4) << "open complete" << dendl;

      // Journaler only guarantees entries before head write_pos have been
      // fully flushed. Before appending new entries, we need to find and
      // drop any partial written entry.
      if (journaler.last_committed.write_pos < journaler.get_write_pos()) {
        dout(4) << "recovering write_pos" << dendl;
        journaler.set_read_pos(journaler.last_committed.write_pos);
        _recover();
        return;
      }

      journaler.set_writeable();
      recovered = true;
      finish_contexts(g_ceph_context, waiting_for_recovery);
    } else {
      // NOTE(review): this branch calls _go_readonly() without taking
      // `lock`, unlike the r == 0 branch — verify this is intentional.
      derr << "Error " << r << " loading Journaler" << dendl;
      _go_readonly(r);
    }
  }));
}
241
3efd9988
FG
242void PurgeQueue::wait_for_recovery(Context* c)
243{
11fdf7f2 244 std::lock_guard l(lock);
f64942e4 245 if (recovered) {
3efd9988 246 c->complete(0);
f64942e4
AA
247 } else if (readonly) {
248 dout(10) << "cannot wait for recovery: PurgeQueue is readonly" << dendl;
249 c->complete(-EROFS);
250 } else {
3efd9988 251 waiting_for_recovery.push_back(c);
f64942e4 252 }
3efd9988 253}
7c673cae 254
// Recover a trustworthy write_pos by reading the journal forward from
// last_committed.write_pos until read_pos catches up with write_pos,
// discarding entries as we go.  Re-schedules itself asynchronously while
// data is not yet readable; on success restores read_pos to the expire
// position, marks the queue writeable and fires recovery waiters.
void PurgeQueue::_recover()
{
  ceph_assert(ceph_mutex_is_locked_by_me(lock));

  // Journaler::is_readable() adjusts write_pos if partial entry is encountered
  while (1) {
    if (!journaler.is_readable() &&
	!journaler.get_error() &&
	journaler.get_read_pos() < journaler.get_write_pos()) {
      // More journal to scan but not buffered yet: resume when readable.
      journaler.wait_for_readable(new LambdaContext([this](int r) {
	std::lock_guard l(lock);
	_recover();
      }));
      return;
    }

    if (journaler.get_error()) {
      int r = journaler.get_error();
      derr << "Error " << r << " recovering write_pos" << dendl;
      _go_readonly(r);
      return;
    }

    if (journaler.get_read_pos() == journaler.get_write_pos()) {
      dout(4) << "write_pos recovered" << dendl;
      // restore original read_pos
      journaler.set_read_pos(journaler.last_committed.expire_pos);
      journaler.set_writeable();
      recovered = true;
      finish_contexts(g_ceph_context, waiting_for_recovery);
      return;
    }

    // Consume one entry (contents are irrelevant; we only advance read_pos).
    bufferlist bl;
    bool readable = journaler.try_read_entry(bl);
    ceph_assert(readable);  // we checked earlier
  }
}
293
// Create a fresh, empty purge queue journal in the metadata pool.
// @fin, if non-null, is queued on waiting_for_recovery and fired once the
// journal head has been written (or failed via _go_readonly()).
void PurgeQueue::create(Context *fin)
{
  dout(4) << "creating" << dendl;
  std::lock_guard l(lock);

  if (fin)
    waiting_for_recovery.push_back(fin);

  file_layout_t layout = file_layout_t::get_default();
  layout.pool_id = metadata_pool;
  journaler.set_writeable();
  journaler.create(&layout, JOURNAL_FORMAT_RESILIENT);
  journaler.write_head(new LambdaContext([this](int r) {
    std::lock_guard l(lock);
    if (r) {
      _go_readonly(r);
    } else {
      // A freshly created journal counts as recovered.
      recovered = true;
      finish_contexts(g_ceph_context, waiting_for_recovery);
    }
  }));
}
316
/**
 * Append a purge item to the journal and try to start executing it.
 *
 * The `completion` context will always be called back via a Finisher:
 * with -EROFS immediately if the queue is readonly, otherwise with the
 * result of the journal flush covering this entry.
 */
void PurgeQueue::push(const PurgeItem &pi, Context *completion)
{
  dout(4) << "pushing inode " << pi.ino << dendl;
  std::lock_guard l(lock);

  if (readonly) {
    dout(10) << "cannot push inode: PurgeQueue is readonly" << dendl;
    completion->complete(-EROFS);
    return;
  }

  // Callers should have waited for open() before using us
  ceph_assert(!journaler.is_readonly());

  bufferlist bl;

  encode(pi, bl);
  journaler.append_entry(bl);
  journaler.wait_for_flush(completion);

  // Maybe go ahead and do something with it right away
  bool could_consume = _consume();
  if (!could_consume) {
    // Usually, it is not necessary to explicitly flush here, because the reader
    // will get flushes generated inside Journaler::is_readable. However,
    // if we remain in a _can_consume()==false state for a long period then
    // we should flush in order to allow MDCache to drop its strays rather
    // than having them wait for purgequeue to progress.
    if (!delayed_flush) {
      // Schedule a single deferred flush; _consume() cancels it once it
      // starts reading again.
      delayed_flush = new LambdaContext([this](int r){
            delayed_flush = nullptr;
            journaler.flush();
          });

      timer.add_event_after(
          g_conf()->mds_purge_queue_busy_flush_period,
          delayed_flush);
    }
  }
}
360
361uint32_t PurgeQueue::_calculate_ops(const PurgeItem &item) const
362{
363 uint32_t ops_required = 0;
364 if (item.action == PurgeItem::PURGE_DIR) {
365 // Directory, count dirfrags to be deleted
11fdf7f2 366 frag_vec_t leaves;
7c673cae 367 if (!item.fragtree.is_leaf(frag_t())) {
11fdf7f2 368 item.fragtree.get_leaves(leaves);
7c673cae
FG
369 }
370 // One for the root, plus any leaves
11fdf7f2 371 ops_required = 1 + leaves.size();
7c673cae
FG
372 } else {
373 // File, work out concurrent Filer::purge deletes
f91f0fd5 374 // Account for removing (or zeroing) backtrace
7c673cae
FG
375 const uint64_t num = (item.size > 0) ?
376 Striper::get_num_objects(item.layout, item.size) : 1;
377
11fdf7f2 378 ops_required = std::min(num, g_conf()->filer_max_purge_ops);
7c673cae 379
7c673cae
FG
380 // Account for deletions for old pools
381 if (item.action != PurgeItem::TRUNCATE_FILE) {
382 ops_required += item.old_pools.size();
383 }
384 }
385
386 return ops_required;
387}
388
f64942e4 389bool PurgeQueue::_can_consume()
7c673cae 390{
f64942e4
AA
391 if (readonly) {
392 dout(10) << "can't consume: PurgeQueue is readonly" << dendl;
393 return false;
394 }
395
7c673cae 396 dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, "
11fdf7f2 397 << in_flight.size() << "/" << g_conf()->mds_max_purge_files
7c673cae
FG
398 << " files" << dendl;
399
400 if (in_flight.size() == 0 && cct->_conf->mds_max_purge_files > 0) {
401 // Always permit consumption if nothing is in flight, so that the ops
402 // limit can never be so low as to forbid all progress (unless
403 // administrator has deliberately paused purging by setting max
404 // purge files to zero).
405 return true;
406 }
407
408 if (ops_in_flight >= max_purge_ops) {
409 dout(20) << "Throttling on op limit " << ops_in_flight << "/"
410 << max_purge_ops << dendl;
411 return false;
412 }
413
414 if (in_flight.size() >= cct->_conf->mds_max_purge_files) {
415 dout(20) << "Throttling on item limit " << in_flight.size()
416 << "/" << cct->_conf->mds_max_purge_files << dendl;
417 return false;
418 } else {
419 return true;
420 }
421}
422
// Permanently switch the queue into readonly mode after an internal IO
// failure.  @r is a negative errno (hence strerror(-r)).  Ownership of
// on_error passes to the finisher here, so the destructor's `delete`
// sees nullptr.  Any recovery waiters are failed with @r.
void PurgeQueue::_go_readonly(int r)
{
  if (readonly) return;
  dout(1) << "going readonly because internal IO failed: " << strerror(-r) << dendl;
  readonly = true;
  finisher.queue(on_error, r);
  on_error = nullptr;
  journaler.set_readonly();
  finish_contexts(g_ceph_context, waiting_for_recovery, r);
}
433
7c673cae
FG
434bool PurgeQueue::_consume()
435{
9f95a23c 436 ceph_assert(ceph_mutex_is_locked_by_me(lock));
7c673cae
FG
437
438 bool could_consume = false;
f64942e4 439 while(_can_consume()) {
7c673cae
FG
440
441 if (delayed_flush) {
442 // We are now going to read from the journal, so any proactive
443 // flush is no longer necessary. This is not functionally necessary
444 // but it can avoid generating extra fragmented flush IOs.
445 timer.cancel_event(delayed_flush);
446 delayed_flush = nullptr;
447 }
448
1adf2230
AA
449 if (int r = journaler.get_error()) {
450 derr << "Error " << r << " recovering write_pos" << dendl;
f64942e4 451 _go_readonly(r);
1adf2230
AA
452 return could_consume;
453 }
454
7c673cae
FG
455 if (!journaler.is_readable()) {
456 dout(10) << " not readable right now" << dendl;
457 // Because we are the writer and the reader of the journal
458 // via the same Journaler instance, we never need to reread_head
459 if (!journaler.have_waiter()) {
9f95a23c 460 journaler.wait_for_readable(new LambdaContext([this](int r) {
11fdf7f2 461 std::lock_guard l(lock);
7c673cae
FG
462 if (r == 0) {
463 _consume();
1adf2230 464 } else if (r != -EAGAIN) {
f64942e4 465 _go_readonly(r);
7c673cae
FG
466 }
467 }));
468 }
469
470 return could_consume;
471 }
472
28e407b8 473 could_consume = true;
7c673cae
FG
474 // The journaler is readable: consume an entry
475 bufferlist bl;
476 bool readable = journaler.try_read_entry(bl);
11fdf7f2 477 ceph_assert(readable); // we checked earlier
7c673cae
FG
478
479 dout(20) << " decoding entry" << dendl;
480 PurgeItem item;
11fdf7f2 481 auto q = bl.cbegin();
7c673cae 482 try {
11fdf7f2 483 decode(item, q);
7c673cae
FG
484 } catch (const buffer::error &err) {
485 derr << "Decode error at read_pos=0x" << std::hex
486 << journaler.get_read_pos() << dendl;
f64942e4 487 _go_readonly(EIO);
7c673cae 488 }
11fdf7f2 489 dout(20) << " executing item (" << item.ino << ")" << dendl;
7c673cae
FG
490 _execute_item(item, journaler.get_read_pos());
491 }
492
493 dout(10) << " cannot consume right now" << dendl;
494
495 return could_consume;
496}
497
// Issue the RADOS operations for one purge item and register it in
// in_flight under its journal position @expire_to.  A gather collects all
// sub-operations; its finisher (run on our Finisher) completes the item,
// advances the expire position and resumes consumption.
void PurgeQueue::_execute_item(
    const PurgeItem &item,
    uint64_t expire_to)
{
  ceph_assert(ceph_mutex_is_locked_by_me(lock));

  // Account the item and its op budget before dispatching any IO.
  in_flight[expire_to] = item;
  logger->set(l_pq_executing, in_flight.size());
  files_high_water = std::max(files_high_water, in_flight.size());
  logger->set(l_pq_executing_high_water, files_high_water);
  auto ops = _calculate_ops(item);
  ops_in_flight += ops;
  logger->set(l_pq_executing_ops, ops_in_flight);
  ops_high_water = std::max(ops_high_water, ops_in_flight);
  logger->set(l_pq_executing_ops_high_water, ops_high_water);

  SnapContext nullsnapc;

  C_GatherBuilder gather(cct);
  if (item.action == PurgeItem::PURGE_FILE) {
    if (item.size > 0) {
      // Delete all data objects covered by the file's size.
      uint64_t num = Striper::get_num_objects(item.layout, item.size);
      dout(10) << " 0~" << item.size << " objects 0~" << num
	       << " snapc " << item.snapc << " on " << item.ino << dendl;
      filer.purge_range(item.ino, &item.layout, item.snapc,
			0, num, ceph::real_clock::now(), 0,
			gather.new_sub());
    }

    // remove the backtrace object if it was not purged
    object_t oid = CInode::get_object_name(item.ino, frag_t(), "");
    if (!gather.has_subs() || !item.layout.pool_ns.empty()) {
      object_locator_t oloc(item.layout.pool_id);
      dout(10) << " remove backtrace object " << oid
	       << " pool " << oloc.pool << " snapc " << item.snapc << dendl;
      objecter->remove(oid, oloc, item.snapc,
			    ceph::real_clock::now(), 0,
			    gather.new_sub());
    }

    // remove old backtrace objects
    for (const auto &p : item.old_pools) {
      object_locator_t oloc(p);
      dout(10) << " remove backtrace object " << oid
	       << " old pool " << p << " snapc " << item.snapc << dendl;
      objecter->remove(oid, oloc, item.snapc,
			    ceph::real_clock::now(), 0,
			    gather.new_sub());
    }
  } else if (item.action == PurgeItem::PURGE_DIR) {
    // Remove every dirfrag object (all leaves plus the root frag).
    object_locator_t oloc(metadata_pool);
    frag_vec_t leaves;
    if (!item.fragtree.is_leaf(frag_t()))
      item.fragtree.get_leaves(leaves);
    leaves.push_back(frag_t());
    for (const auto &leaf : leaves) {
      object_t oid = CInode::get_object_name(item.ino, leaf, "");
      dout(10) << " remove dirfrag " << oid << dendl;
      objecter->remove(oid, oloc, nullsnapc,
		       ceph::real_clock::now(),
		       0, gather.new_sub());
    }
  } else if (item.action == PurgeItem::TRUNCATE_FILE) {
    const uint64_t num = Striper::get_num_objects(item.layout, item.size);
    dout(10) << " 0~" << item.size << " objects 0~" << num
	     << " snapc " << item.snapc << " on " << item.ino << dendl;

    // keep backtrace object
    if (num > 1) {
      filer.purge_range(item.ino, &item.layout, item.snapc,
			1, num - 1, ceph::real_clock::now(),
			0, gather.new_sub());
    }
    // Zero (rather than remove) the first object so the backtrace survives.
    filer.zero(item.ino, &item.layout, item.snapc,
	       0, item.layout.object_size,
	       ceph::real_clock::now(),
	       0, true, gather.new_sub());
  } else {
    derr << "Invalid item (action=" << item.action << ") in purge queue, "
            "dropping it" << dendl;
    // Roll back the accounting done above and drop the item.
    ops_in_flight -= ops;
    logger->set(l_pq_executing_ops, ops_in_flight);
    ops_high_water = std::max(ops_high_water, ops_in_flight);
    logger->set(l_pq_executing_ops_high_water, ops_high_water);
    in_flight.erase(expire_to);
    logger->set(l_pq_executing, in_flight.size());
    files_high_water = std::max(files_high_water, in_flight.size());
    logger->set(l_pq_executing_high_water, files_high_water);
    return;
  }
  ceph_assert(gather.has_subs());

  gather.set_finisher(new C_OnFinisher(
                      new LambdaContext([this, expire_to](int r){
    std::lock_guard l(lock);

    if (r == -EBLACKLISTED) {
      // We lost our OSD session: escalate to the error handler instead of
      // completing the item.  The finisher takes ownership of on_error.
      finisher.queue(on_error, r);
      on_error = nullptr;
      return;
    }

    _execute_item_complete(expire_to);
    _consume();

    // Have we gone idle?  If so, do an extra write_head now instead of
    // waiting for next flush after journaler_write_head_interval.
    // Also do this periodically even if not idle, so that the persisted
    // expire_pos doesn't fall too far behind our progress when consuming
    // a very long queue.
    if (!readonly &&
	(in_flight.empty() || journaler.write_head_needed())) {
      journaler.write_head(nullptr);
    }
  }), &finisher));

  gather.activate();
}
616
// Finish the in-flight item recorded at journal position @expire_to:
// advance the journal expire position as far as completed items allow
// (out-of-order completions are parked in pending_expire), release the
// item's op budget and update counters.
void PurgeQueue::_execute_item_complete(
    uint64_t expire_to)
{
  ceph_assert(ceph_mutex_is_locked_by_me(lock));
  dout(10) << "complete at 0x" << std::hex << expire_to << std::dec << dendl;
  ceph_assert(in_flight.count(expire_to) == 1);

  auto iter = in_flight.find(expire_to);
  ceph_assert(iter != in_flight.end());
  if (iter == in_flight.begin()) {
    // This was the oldest in-flight item: we may advance expire_pos past
    // it, and possibly past previously-parked out-of-order completions.
    uint64_t pos = expire_to;
    if (!pending_expire.empty()) {
      auto n = iter;
      ++n;
      if (n == in_flight.end()) {
	// Nothing else in flight: everything parked is expirable.
	pos = *pending_expire.rbegin();
	pending_expire.clear();
      } else {
	// Consume parked positions up to (not including) the next
	// still-in-flight item.
	auto p = pending_expire.begin();
	do {
	  if (*p >= n->first)
	    break;
	  pos = *p;
	  pending_expire.erase(p++);
	} while (p != pending_expire.end());
      }
    }
    dout(10) << "expiring to 0x" << std::hex << pos << std::dec << dendl;
    journaler.set_expire_pos(pos);
  } else {
    // This is completely fine, we're not supposed to purge files in
    // order when doing them in parallel.
    dout(10) << "non-sequential completion, not expiring anything" << dendl;
    pending_expire.insert(expire_to);
  }

  // Release the op budget reserved in _execute_item().
  ops_in_flight -= _calculate_ops(iter->second);
  logger->set(l_pq_executing_ops, ops_in_flight);
  ops_high_water = std::max(ops_high_water, ops_in_flight);
  logger->set(l_pq_executing_ops_high_water, ops_high_water);

  dout(10) << "completed item for ino " << iter->second.ino << dendl;

  in_flight.erase(iter);
  logger->set(l_pq_executing, in_flight.size());
  files_high_water = std::max(files_high_water, in_flight.size());
  logger->set(l_pq_executing_high_water, files_high_water);
  dout(10) << "in_flight.size() now " << in_flight.size() << dendl;

  // Estimate how many serialized items remain in the journal from the
  // fixed per-item footprint computed in activate().
  uint64_t write_pos = journaler.get_write_pos();
  uint64_t read_pos = journaler.get_read_pos();
  uint64_t expire_pos = journaler.get_expire_pos();
  uint64_t item_num = (write_pos - (in_flight.size() ? expire_pos : read_pos))
		      / purge_item_journal_size;
  dout(10) << "left purge items in journal: " << item_num
    << " (purge_item_journal_size/write_pos/read_pos/expire_pos) now at "
    << "(" << purge_item_journal_size << "/" << write_pos << "/" << read_pos
    << "/" << expire_pos << ")" << dendl;

  logger->set(l_pq_item_in_journal, item_num);
  logger->inc(l_pq_executed);
}
679
// Recompute max_purge_ops from the current PG count of the data pools:
// (PGs / max_mds) * mds_max_purge_ops_per_pg, optionally clamped by the
// mds_max_purge_ops hard limit.
void PurgeQueue::update_op_limit(const MDSMap &mds_map)
{
  std::lock_guard l(lock);

  if (readonly) {
    dout(10) << "skipping; PurgeQueue is readonly" << dendl;
    return;
  }

  uint64_t pg_count = 0;
  objecter->with_osdmap([&](const OSDMap& o) {
    // Number of PGs across all data pools
    const std::vector<int64_t> &data_pools = mds_map.get_data_pools();
    for (const auto dp : data_pools) {
      if (o.get_pg_pool(dp) == NULL) {
        // It is possible that we have an older OSDMap than MDSMap,
        // because we don't start watching every OSDMap until after
        // MDSRank is initialized
        dout(4) << " data pool " << dp << " not found in OSDMap" << dendl;
        continue;
      }
      pg_count += o.get_pg_num(dp);
    }
  });

  // Work out a limit based on n_pgs / n_mdss, multiplied by the user's
  // preference for how many ops per PG
  max_purge_ops = uint64_t(((double)pg_count / (double)mds_map.get_max_mds()) *
			   cct->_conf->mds_max_purge_ops_per_pg);

  // User may also specify a hard limit, apply this if so.
  if (cct->_conf->mds_max_purge_ops) {
    max_purge_ops = std::min(max_purge_ops, cct->_conf->mds_max_purge_ops);
  }
}
715
// React to runtime config changes: recompute the op limit when the
// per-PG/hard op limits change, and (when the file limit changes while
// idle) kick _consume() in case the limit went from zero to nonzero.
void PurgeQueue::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
{
  if (changed.count("mds_max_purge_ops")
      || changed.count("mds_max_purge_ops_per_pg")) {
    update_op_limit(mds_map);
  } else if (changed.count("mds_max_purge_files")) {
    std::lock_guard l(lock);
    if (in_flight.empty()) {
      // We might have gone from zero to a finite limit, so
      // might need to kick off consume.
      dout(4) << "maybe start work again (max_purge_files="
              << g_conf()->mds_max_purge_files << dendl;
      finisher.queue(new LambdaContext([this](int r){
        std::lock_guard l(lock);
        _consume();
      }));
    }
  }
}
735
// Drive the queue toward empty during MDS shutdown.  Returns true when
// fully drained (nothing in flight and the journal backlog consumed).
// While draining, reports byte-based progress via the out-parameters and
// lifts the op throttle so the backlog clears as fast as possible.
bool PurgeQueue::drain(
  uint64_t *progress,
  uint64_t *progress_total,
  size_t *in_flight_count
  )
{
  std::lock_guard l(lock);

  if (readonly) {
    dout(10) << "skipping drain; PurgeQueue is readonly" << dendl;
    return true;
  }

  ceph_assert(progress != nullptr);
  ceph_assert(progress_total != nullptr);
  ceph_assert(in_flight_count != nullptr);

  const bool done = in_flight.empty() && (
      journaler.get_read_pos() == journaler.get_write_pos());
  if (done) {
    return true;
  }

  const uint64_t bytes_remaining = journaler.get_write_pos()
                                   - journaler.get_read_pos();

  if (!draining) {
    // Start of draining: remember how much there was outstanding at
    // this point so that we can give a progress percentage later
    draining = true;

    // Lift the op throttle as this daemon now has nothing to do but
    // drain the purge queue, so do it as fast as we can.
    max_purge_ops = 0xffff;
  }

  // drain_initial only ever grows, so the progress fraction is monotonic
  // even if more items are pushed while draining.
  drain_initial = std::max(bytes_remaining, drain_initial);

  *progress = drain_initial - bytes_remaining;
  *progress_total = drain_initial;
  *in_flight_count = in_flight.size();

  return false;
}
780
11fdf7f2
TL
781std::string_view PurgeItem::get_type_str() const
782{
783 switch(action) {
784 case PurgeItem::NONE: return "NONE";
785 case PurgeItem::PURGE_FILE: return "PURGE_FILE";
786 case PurgeItem::PURGE_DIR: return "PURGE_DIR";
787 case PurgeItem::TRUNCATE_FILE: return "TRUNCATE_FILE";
788 default:
789 return "UNKNOWN";
790 }
791}
792