ceph/src/mds/PurgeQueue.cc

// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "common/debug.h"
#include "mds/mdstypes.h"
#include "mds/CInode.h"
#include "mds/MDCache.h"

#include "PurgeQueue.h"

#include <string.h>

#define dout_context cct
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix _prefix(_dout, rank) << __func__ << ": "
static ostream& _prefix(std::ostream *_dout, mds_rank_t rank) {
  return *_dout << "mds." << rank << ".purge_queue ";
}

const std::map<std::string, PurgeItem::Action> PurgeItem::actions = {
  {"NONE", PurgeItem::NONE},
  {"PURGE_FILE", PurgeItem::PURGE_FILE},
  {"TRUNCATE_FILE", PurgeItem::TRUNCATE_FILE},
  {"PURGE_DIR", PurgeItem::PURGE_DIR}
};

void PurgeItem::encode(bufferlist &bl) const
{
  ENCODE_START(2, 1, bl);
  encode((uint8_t)action, bl);
  encode(ino, bl);
  encode(size, bl);
  encode(layout, bl, CEPH_FEATURE_FS_FILE_LAYOUT_V2);
  encode(old_pools, bl);
  encode(snapc, bl);
  encode(fragtree, bl);
  encode(stamp, bl);
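  // NB: the trailing 0xff padding mirrors the pad bytes written by the
  // buggy v13.2.2 encoding (see the v1 path in decode() below); keeping
  // it presumably holds the entry size stable across the two encodings.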
  uint8_t static const pad = 0xff;
  for (unsigned int i = 0; i < pad_size; i++) {
    encode(pad, bl);
  }
  ENCODE_FINISH(bl);
}

void PurgeItem::decode(bufferlist::const_iterator &p)
{
  DECODE_START(2, p);
  bool done = false;
  if (struct_v == 1) {
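    // Entries written by v13.2.2 used a different field order while still
    // reporting struct_v == 1, so the two layouts cannot be told apart up
    // front. Optimistically try the broken layout; if it throws or overruns
    // struct_end, rewind to p_start and fall through to the normal layout.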
    auto p_start = p;
    try {
      // bad encoding introduced by v13.2.2
      decode(stamp, p);
      decode(pad_size, p);
      p += pad_size;
      decode((uint8_t&)action, p);
      decode(ino, p);
      decode(size, p);
      decode(layout, p);
      decode(old_pools, p);
      decode(snapc, p);
      decode(fragtree, p);
      if (p.get_off() > struct_end)
        throw buffer::end_of_buffer();
      done = true;
    } catch (const buffer::error &e) {
      p = p_start;
    }
  }
  if (!done) {
    decode((uint8_t&)action, p);
    decode(ino, p);
    decode(size, p);
    decode(layout, p);
    decode(old_pools, p);
    decode(snapc, p);
    decode(fragtree, p);
    if (struct_v >= 2) {
      decode(stamp, p);
    }
  }
  DECODE_FINISH(p);
}

// TODO: if Objecter has any slow requests, take that as a hint and
// slow down our rate of purging (keep accepting pushes though)
PurgeQueue::PurgeQueue(
      CephContext *cct_,
      mds_rank_t rank_,
      const int64_t metadata_pool_,
      Objecter *objecter_,
      Context *on_error_)
  :
    cct(cct_),
    rank(rank_),
    metadata_pool(metadata_pool_),
    finisher(cct, "PurgeQueue", "PQ_Finisher"),
    timer(cct, lock),
    filer(objecter_, &finisher),
    objecter(objecter_),
    journaler("pq", MDS_INO_PURGE_QUEUE + rank, metadata_pool,
              CEPH_FS_ONDISK_MAGIC, objecter_, nullptr, 0,
              &finisher),
    on_error(on_error_)
{
  ceph_assert(cct != nullptr);
  ceph_assert(on_error != nullptr);
  ceph_assert(objecter != nullptr);
  journaler.set_write_error_handler(on_error);
}

PurgeQueue::~PurgeQueue()
{
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
  delete on_error;
}

void PurgeQueue::create_logger()
{
  PerfCountersBuilder pcb(g_ceph_context, "purge_queue", l_pq_first, l_pq_last);

  pcb.add_u64_counter(l_pq_executed, "pq_executed", "Purge queue tasks executed",
                      "purg", PerfCountersBuilder::PRIO_INTERESTING);

  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  pcb.add_u64(l_pq_executing_ops, "pq_executing_ops", "Purge queue ops in flight");
  pcb.add_u64(l_pq_executing_ops_high_water, "pq_executing_ops_high_water", "Maximum number of executing file purge ops");
  pcb.add_u64(l_pq_executing, "pq_executing", "Purge queue tasks in flight");
  pcb.add_u64(l_pq_executing_high_water, "pq_executing_high_water", "Maximum number of executing file purges");
  pcb.add_u64(l_pq_item_in_journal, "pq_item_in_journal", "Purge items left in journal");

  logger.reset(pcb.create_perf_counters());
  g_ceph_context->get_perfcounters_collection()->add(logger.get());
}

void PurgeQueue::init()
{
  std::lock_guard l(lock);

  ceph_assert(logger != nullptr);

  finisher.start();
  timer.init();
}

void PurgeQueue::activate()
{
  std::lock_guard l(lock);

  {
    PurgeItem item;
    bufferlist bl;

    // Calculate the serialized size of a purge item as stored in the
    // journal; used later to estimate how many items remain in the journal.
    ::encode(item, bl);
    purge_item_journal_size = bl.length() + journaler.get_journal_envelope_size();
  }

  if (readonly) {
    dout(10) << "skipping activate: PurgeQueue is readonly" << dendl;
    return;
  }

  if (journaler.get_read_pos() == journaler.get_write_pos())
    return;

  if (in_flight.empty()) {
    dout(4) << "start work (by drain)" << dendl;
    finisher.queue(new LambdaContext([this](int r) {
      std::lock_guard l(lock);
      _consume();
    }));
  }
}

void PurgeQueue::shutdown()
{
  std::lock_guard l(lock);

  journaler.shutdown();
  timer.shutdown();
  finisher.stop();
}

void PurgeQueue::open(Context *completion)
{
  dout(4) << "opening" << dendl;

  std::lock_guard l(lock);

  if (completion)
    waiting_for_recovery.push_back(completion);

  journaler.recover(new LambdaContext([this](int r){
    if (r == -ENOENT) {
      dout(1) << "Purge Queue not found, assuming this is an upgrade and "
                 "creating it." << dendl;
      create(NULL);
    } else if (r == 0) {
      std::lock_guard l(lock);
      dout(4) << "open complete" << dendl;

      // Journaler only guarantees that entries before the head's write_pos
      // have been fully flushed. Before appending new entries, we need to
      // find and drop any partially written entry.
      if (journaler.last_committed.write_pos < journaler.get_write_pos()) {
        dout(4) << "recovering write_pos" << dendl;
        journaler.set_read_pos(journaler.last_committed.write_pos);
        _recover();
        return;
      }

      journaler.set_writeable();
      recovered = true;
      finish_contexts(g_ceph_context, waiting_for_recovery);
    } else {
      derr << "Error " << r << " loading Journaler" << dendl;
      _go_readonly(r);
    }
  }));
}

void PurgeQueue::wait_for_recovery(Context* c)
{
  std::lock_guard l(lock);
  if (recovered) {
    c->complete(0);
  } else if (readonly) {
    dout(10) << "cannot wait for recovery: PurgeQueue is readonly" << dendl;
    c->complete(-EROFS);
  } else {
    waiting_for_recovery.push_back(c);
  }
}

void PurgeQueue::_recover()
{
  ceph_assert(ceph_mutex_is_locked_by_me(lock));

  // Journaler::is_readable() adjusts write_pos if a partial entry is encountered
  while (1) {
    if (!journaler.is_readable() &&
        !journaler.get_error() &&
        journaler.get_read_pos() < journaler.get_write_pos()) {
      journaler.wait_for_readable(new LambdaContext([this](int r) {
        std::lock_guard l(lock);
        _recover();
      }));
      return;
    }

    if (journaler.get_error()) {
      int r = journaler.get_error();
      derr << "Error " << r << " recovering write_pos" << dendl;
      _go_readonly(r);
      return;
    }

    if (journaler.get_read_pos() == journaler.get_write_pos()) {
      dout(4) << "write_pos recovered" << dendl;
      // restore original read_pos
      journaler.set_read_pos(journaler.last_committed.expire_pos);
      journaler.set_writeable();
      recovered = true;
      finish_contexts(g_ceph_context, waiting_for_recovery);
      return;
    }

    bufferlist bl;
    bool readable = journaler.try_read_entry(bl);
    ceph_assert(readable);  // we checked earlier
  }
}

void PurgeQueue::create(Context *fin)
{
  dout(4) << "creating" << dendl;
  std::lock_guard l(lock);

  if (fin)
    waiting_for_recovery.push_back(fin);

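  // Initialize a fresh, empty journal in the metadata pool and persist its
  // header; recovery is considered complete once the head write succeeds.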
  file_layout_t layout = file_layout_t::get_default();
  layout.pool_id = metadata_pool;
  journaler.set_writeable();
  journaler.create(&layout, JOURNAL_FORMAT_RESILIENT);
  journaler.write_head(new LambdaContext([this](int r) {
    std::lock_guard l(lock);
    if (r) {
      _go_readonly(r);
    } else {
      recovered = true;
      finish_contexts(g_ceph_context, waiting_for_recovery);
    }
  }));
}

/**
 * The `completion` context will always be called back via a Finisher
 */
void PurgeQueue::push(const PurgeItem &pi, Context *completion)
{
  dout(4) << "pushing inode " << pi.ino << dendl;
  std::lock_guard l(lock);

  if (readonly) {
    dout(10) << "cannot push inode: PurgeQueue is readonly" << dendl;
    completion->complete(-EROFS);
    return;
  }

  // Callers should have waited for open() before using us
  ceph_assert(!journaler.is_readonly());

  bufferlist bl;

  encode(pi, bl);
  journaler.append_entry(bl);
  journaler.wait_for_flush(completion);

  // Maybe go ahead and do something with it right away
  bool could_consume = _consume();
  if (!could_consume) {
    // Usually it is not necessary to explicitly flush here, because the
    // reader will get flushes generated inside Journaler::is_readable.
    // However, if we remain in a _can_consume()==false state for a long
    // period then we should flush in order to allow MDCache to drop its
    // strays rather than having them wait for the purge queue to progress.
    if (!delayed_flush) {
      delayed_flush = new LambdaContext([this](int r){
        delayed_flush = nullptr;
        journaler.flush();
      });

      timer.add_event_after(
        g_conf()->mds_purge_queue_busy_flush_period,
        delayed_flush);
    }
  }
}

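// Worked example (illustrative, assuming a default layout of 4 MiB objects):
// purging a 100 MiB file needs ceil(100/4) = 25 object removals, capped at
// filer_max_purge_ops, plus one op for the backtrace object and one more per
// old pool the inode previously wrote to.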
uint32_t PurgeQueue::_calculate_ops(const PurgeItem &item) const
{
  uint32_t ops_required = 0;
  if (item.action == PurgeItem::PURGE_DIR) {
    // Directory, count dirfrags to be deleted
    frag_vec_t leaves;
    if (!item.fragtree.is_leaf(frag_t())) {
      item.fragtree.get_leaves(leaves);
    }
    // One for the root, plus any leaves
    ops_required = 1 + leaves.size();
  } else {
    // File, work out concurrent Filer::purge deletes
    const uint64_t num = (item.size > 0) ?
      Striper::get_num_objects(item.layout, item.size) : 1;

    ops_required = std::min(num, g_conf()->filer_max_purge_ops);

    // Account for removing (or zeroing) backtrace
    ops_required += 1;

    // Account for deletions for old pools
    if (item.action != PurgeItem::TRUNCATE_FILE) {
      ops_required += item.old_pools.size();
    }
  }

  return ops_required;
}

bool PurgeQueue::_can_consume()
{
  if (readonly) {
    dout(10) << "can't consume: PurgeQueue is readonly" << dendl;
    return false;
  }

  dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, "
           << in_flight.size() << "/" << g_conf()->mds_max_purge_files
           << " files" << dendl;

  if (in_flight.size() == 0 && cct->_conf->mds_max_purge_files > 0) {
    // Always permit consumption if nothing is in flight, so that the ops
    // limit can never be so low as to forbid all progress (unless the
    // administrator has deliberately paused purging by setting max
    // purge files to zero).
    return true;
  }

  if (ops_in_flight >= max_purge_ops) {
    dout(20) << "Throttling on op limit " << ops_in_flight << "/"
             << max_purge_ops << dendl;
    return false;
  }

  if (in_flight.size() >= cct->_conf->mds_max_purge_files) {
    dout(20) << "Throttling on item limit " << in_flight.size()
             << "/" << cct->_conf->mds_max_purge_files << dendl;
    return false;
  } else {
    return true;
  }
}

void PurgeQueue::_go_readonly(int r)
{
  if (readonly) return;
  dout(1) << "going readonly because internal IO failed: " << strerror(-r) << dendl;
  readonly = true;
  finisher.queue(on_error, r);
  on_error = nullptr;
  journaler.set_readonly();
  finish_contexts(g_ceph_context, waiting_for_recovery, r);
}

bool PurgeQueue::_consume()
{
  ceph_assert(ceph_mutex_is_locked_by_me(lock));

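  // Main consumption loop: while the throttles allow, read one encoded
  // PurgeItem from the journal and launch its RADOS ops via _execute_item().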
  bool could_consume = false;
  while (_can_consume()) {

    if (delayed_flush) {
      // We are now going to read from the journal, so any proactive
      // flush is no longer necessary. This is not functionally necessary
      // but it can avoid generating extra fragmented flush IOs.
      timer.cancel_event(delayed_flush);
      delayed_flush = nullptr;
    }

    if (int r = journaler.get_error()) {
      derr << "Error " << r << " recovering write_pos" << dendl;
      _go_readonly(r);
      return could_consume;
    }

    if (!journaler.is_readable()) {
      dout(10) << " not readable right now" << dendl;
      // Because we are the writer and the reader of the journal
      // via the same Journaler instance, we never need to reread_head
      if (!journaler.have_waiter()) {
        journaler.wait_for_readable(new LambdaContext([this](int r) {
          std::lock_guard l(lock);
          if (r == 0) {
            _consume();
          } else if (r != -EAGAIN) {
            _go_readonly(r);
          }
        }));
      }

      return could_consume;
    }

    could_consume = true;
    // The journaler is readable: consume an entry
    bufferlist bl;
    bool readable = journaler.try_read_entry(bl);
    ceph_assert(readable);  // we checked earlier

    dout(20) << " decoding entry" << dendl;
    PurgeItem item;
    auto q = bl.cbegin();
    try {
      decode(item, q);
    } catch (const buffer::error &err) {
      derr << "Decode error at read_pos=0x" << std::hex
           << journaler.get_read_pos() << dendl;
      _go_readonly(-EIO);
    }
    dout(20) << " executing item (" << item.ino << ")" << dendl;
    _execute_item(item, journaler.get_read_pos());
  }

  dout(10) << " cannot consume right now" << dendl;

  return could_consume;
}

void PurgeQueue::_execute_item(
    const PurgeItem &item,
    uint64_t expire_to)
{
  ceph_assert(ceph_mutex_is_locked_by_me(lock));

  in_flight[expire_to] = item;
  logger->set(l_pq_executing, in_flight.size());
  files_high_water = std::max(files_high_water, in_flight.size());
  logger->set(l_pq_executing_high_water, files_high_water);
  auto ops = _calculate_ops(item);
  ops_in_flight += ops;
  logger->set(l_pq_executing_ops, ops_in_flight);
  ops_high_water = std::max(ops_high_water, ops_in_flight);
  logger->set(l_pq_executing_ops_high_water, ops_high_water);

  SnapContext nullsnapc;

  C_GatherBuilder gather(cct);
  if (item.action == PurgeItem::PURGE_FILE) {
    if (item.size > 0) {
      uint64_t num = Striper::get_num_objects(item.layout, item.size);
      dout(10) << " 0~" << item.size << " objects 0~" << num
               << " snapc " << item.snapc << " on " << item.ino << dendl;
      filer.purge_range(item.ino, &item.layout, item.snapc,
                        0, num, ceph::real_clock::now(), 0,
                        gather.new_sub());
    }

    // remove the backtrace object if it was not purged
    object_t oid = CInode::get_object_name(item.ino, frag_t(), "");
    if (!gather.has_subs() || !item.layout.pool_ns.empty()) {
      object_locator_t oloc(item.layout.pool_id);
      dout(10) << " remove backtrace object " << oid
               << " pool " << oloc.pool << " snapc " << item.snapc << dendl;
      objecter->remove(oid, oloc, item.snapc,
                       ceph::real_clock::now(), 0,
                       gather.new_sub());
    }

    // remove old backtrace objects
    for (const auto &p : item.old_pools) {
      object_locator_t oloc(p);
      dout(10) << " remove backtrace object " << oid
               << " old pool " << p << " snapc " << item.snapc << dendl;
      objecter->remove(oid, oloc, item.snapc,
                       ceph::real_clock::now(), 0,
                       gather.new_sub());
    }
  } else if (item.action == PurgeItem::PURGE_DIR) {
    object_locator_t oloc(metadata_pool);
    frag_vec_t leaves;
    if (!item.fragtree.is_leaf(frag_t()))
      item.fragtree.get_leaves(leaves);
    leaves.push_back(frag_t());
    for (const auto &leaf : leaves) {
      object_t oid = CInode::get_object_name(item.ino, leaf, "");
      dout(10) << " remove dirfrag " << oid << dendl;
      objecter->remove(oid, oloc, nullsnapc,
                       ceph::real_clock::now(),
                       0, gather.new_sub());
    }
  } else if (item.action == PurgeItem::TRUNCATE_FILE) {
    const uint64_t num = Striper::get_num_objects(item.layout, item.size);
    dout(10) << " 0~" << item.size << " objects 0~" << num
             << " snapc " << item.snapc << " on " << item.ino << dendl;

    // keep backtrace object
    if (num > 1) {
      filer.purge_range(item.ino, &item.layout, item.snapc,
                        1, num - 1, ceph::real_clock::now(),
                        0, gather.new_sub());
    }
    filer.zero(item.ino, &item.layout, item.snapc,
               0, item.layout.object_size,
               ceph::real_clock::now(),
               0, true, gather.new_sub());
  } else {
    derr << "Invalid item (action=" << item.action << ") in purge queue, "
            "dropping it" << dendl;
    ops_in_flight -= ops;
    logger->set(l_pq_executing_ops, ops_in_flight);
    ops_high_water = std::max(ops_high_water, ops_in_flight);
    logger->set(l_pq_executing_ops_high_water, ops_high_water);
    in_flight.erase(expire_to);
    logger->set(l_pq_executing, in_flight.size());
    files_high_water = std::max(files_high_water, in_flight.size());
    logger->set(l_pq_executing_high_water, files_high_water);
    return;
  }
  ceph_assert(gather.has_subs());

  gather.set_finisher(new C_OnFinisher(
                      new LambdaContext([this, expire_to](int r){
    std::lock_guard l(lock);

    if (r == -EBLACKLISTED) {
      finisher.queue(on_error, r);
      on_error = nullptr;
      return;
    }

    _execute_item_complete(expire_to);
    _consume();

    // Have we gone idle? If so, do an extra write_head now instead of
    // waiting for next flush after journaler_write_head_interval.
    // Also do this periodically even if not idle, so that the persisted
    // expire_pos doesn't fall too far behind our progress when consuming
    // a very long queue.
    if (!readonly &&
        (in_flight.empty() || journaler.write_head_needed())) {
      journaler.write_head(nullptr);
    }
  }), &finisher));

  gather.activate();
}

void PurgeQueue::_execute_item_complete(
    uint64_t expire_to)
{
  ceph_assert(ceph_mutex_is_locked_by_me(lock));
  dout(10) << "complete at 0x" << std::hex << expire_to << std::dec << dendl;
  ceph_assert(in_flight.count(expire_to) == 1);

  auto iter = in_flight.find(expire_to);
  ceph_assert(iter != in_flight.end());
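  // Only advance the journal's expire_pos when the *oldest* in-flight item
  // completes; out-of-order completions are parked in pending_expire and
  // folded in once everything ahead of them has finished.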
  if (iter == in_flight.begin()) {
    uint64_t pos = expire_to;
    if (!pending_expire.empty()) {
      auto n = iter;
      ++n;
      if (n == in_flight.end()) {
        pos = *pending_expire.rbegin();
        pending_expire.clear();
      } else {
        auto p = pending_expire.begin();
        do {
          if (*p >= n->first)
            break;
          pos = *p;
          pending_expire.erase(p++);
        } while (p != pending_expire.end());
      }
    }
    dout(10) << "expiring to 0x" << std::hex << pos << std::dec << dendl;
    journaler.set_expire_pos(pos);
  } else {
    // This is completely fine, we're not supposed to purge files in
    // order when doing them in parallel.
    dout(10) << "non-sequential completion, not expiring anything" << dendl;
    pending_expire.insert(expire_to);
  }

  ops_in_flight -= _calculate_ops(iter->second);
  logger->set(l_pq_executing_ops, ops_in_flight);
  ops_high_water = std::max(ops_high_water, ops_in_flight);
  logger->set(l_pq_executing_ops_high_water, ops_high_water);

  dout(10) << "completed item for ino " << iter->second.ino << dendl;

  in_flight.erase(iter);
  logger->set(l_pq_executing, in_flight.size());
  files_high_water = std::max(files_high_water, in_flight.size());
  logger->set(l_pq_executing_high_water, files_high_water);
  dout(10) << "in_flight.size() now " << in_flight.size() << dendl;

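  // Estimate how many encoded items remain in the journal. activate()
  // precomputed the (approximately fixed) per-item footprint, so dividing
  // the outstanding byte span by it approximates the remaining item count.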
  uint64_t write_pos = journaler.get_write_pos();
  uint64_t read_pos = journaler.get_read_pos();
  uint64_t expire_pos = journaler.get_expire_pos();
  uint64_t item_num = (write_pos - (in_flight.size() ? expire_pos : read_pos))
                      / purge_item_journal_size;
  dout(10) << "purge items left in journal: " << item_num
           << " (purge_item_journal_size/write_pos/read_pos/expire_pos) now at "
           << "(" << purge_item_journal_size << "/" << write_pos << "/" << read_pos
           << "/" << expire_pos << ")" << dendl;

  logger->set(l_pq_item_in_journal, item_num);
  logger->inc(l_pq_executed);
}

void PurgeQueue::update_op_limit(const MDSMap &mds_map)
{
  std::lock_guard l(lock);

  if (readonly) {
    dout(10) << "skipping; PurgeQueue is readonly" << dendl;
    return;
  }

  uint64_t pg_count = 0;
  objecter->with_osdmap([&](const OSDMap& o) {
    // Number of PGs across all data pools
    const std::vector<int64_t> &data_pools = mds_map.get_data_pools();
    for (const auto dp : data_pools) {
      if (o.get_pg_pool(dp) == NULL) {
        // It is possible that we have an older OSDMap than MDSMap,
        // because we don't start watching every OSDMap until after
        // MDSRank is initialized
        dout(4) << " data pool " << dp << " not found in OSDMap" << dendl;
        continue;
      }
      pg_count += o.get_pg_num(dp);
    }
  });

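  // e.g. (illustrative numbers): 1024 PGs across the data pools with
  // max_mds=2 and mds_max_purge_ops_per_pg=0.5 gives max_purge_ops = 256.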
  // Work out a limit based on n_pgs / n_mdss, multiplied by the user's
  // preference for how many ops per PG
  max_purge_ops = uint64_t(((double)pg_count / (double)mds_map.get_max_mds()) *
                           cct->_conf->mds_max_purge_ops_per_pg);

  // The user may also specify a hard limit; apply it if so.
  if (cct->_conf->mds_max_purge_ops) {
    max_purge_ops = std::min(max_purge_ops, cct->_conf->mds_max_purge_ops);
  }
}

void PurgeQueue::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
{
  if (changed.count("mds_max_purge_ops")
      || changed.count("mds_max_purge_ops_per_pg")) {
    update_op_limit(mds_map);
  } else if (changed.count("mds_max_purge_files")) {
    std::lock_guard l(lock);
    if (in_flight.empty()) {
      // We might have gone from zero to a finite limit, so we
      // might need to kick off consume.
      dout(4) << "maybe start work again (max_purge_files="
              << g_conf()->mds_max_purge_files << ")" << dendl;
      finisher.queue(new LambdaContext([this](int r){
        std::lock_guard l(lock);
        _consume();
      }));
    }
  }
}

bool PurgeQueue::drain(
    uint64_t *progress,
    uint64_t *progress_total,
    size_t *in_flight_count
    )
{
  std::lock_guard l(lock);

  if (readonly) {
    dout(10) << "skipping drain; PurgeQueue is readonly" << dendl;
    return true;
  }

  ceph_assert(progress != nullptr);
  ceph_assert(progress_total != nullptr);
  ceph_assert(in_flight_count != nullptr);

  const bool done = in_flight.empty() && (
      journaler.get_read_pos() == journaler.get_write_pos());
  if (done) {
    return true;
  }

  const uint64_t bytes_remaining = journaler.get_write_pos()
                                   - journaler.get_read_pos();

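  // Progress is reported in journal bytes: drain_initial tracks the largest
  // backlog seen since draining began, so the progress/progress_total pair
  // never goes negative even if new items land mid-drain.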
  if (!draining) {
    // Start of draining: remember how much there was outstanding at
    // this point so that we can give a progress percentage later
    draining = true;

    // Lift the op throttle, as this daemon now has nothing to do but
    // drain the purge queue, so do it as fast as we can.
    max_purge_ops = 0xffff;
  }

  drain_initial = std::max(bytes_remaining, drain_initial);

  *progress = drain_initial - bytes_remaining;
  *progress_total = drain_initial;
  *in_flight_count = in_flight.size();

  return false;
}

std::string_view PurgeItem::get_type_str() const
{
  switch(action) {
  case PurgeItem::NONE: return "NONE";
  case PurgeItem::PURGE_FILE: return "PURGE_FILE";
  case PurgeItem::PURGE_DIR: return "PURGE_DIR";
  case PurgeItem::TRUNCATE_FILE: return "TRUNCATE_FILE";
  default:
    return "UNKNOWN";
  }
}