1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
15 #include "common/debug.h"
16 #include "mds/mdstypes.h"
17 #include "mds/CInode.h"
18 #include "mds/MDCache.h"
20 #include "PurgeQueue.h"
24 #define dout_context cct
25 #define dout_subsys ceph_subsys_mds
27 #define dout_prefix _prefix(_dout, rank) << __func__ << ": "
28 static ostream
& _prefix(std::ostream
*_dout
, mds_rank_t rank
) {
29 return *_dout
<< "mds." << rank
<< ".purge_queue ";
32 const std::map
<std::string
, PurgeItem::Action
> PurgeItem::actions
= {
33 {"NONE", PurgeItem::NONE
},
34 {"PURGE_FILE", PurgeItem::PURGE_FILE
},
35 {"TRUNCATE_FILE", PurgeItem::TRUNCATE_FILE
},
36 {"PURGE_DIR", PurgeItem::PURGE_DIR
}
// NOTE(review): partial extraction -- the ENCODE_FINISH/DECODE_START
// wrappers, several encoded fields and the padding-loop body are missing
// from this view; fragments below are kept verbatim.
// Serialize this PurgeItem into 'bl' (versioned encoding: v2, compat v1).
39 void PurgeItem::encode(bufferlist
&bl
) const
41 ENCODE_START(2, 1, bl
);
// Action enum is encoded as a raw uint8_t for wire compatibility.
42 encode((uint8_t)action
, bl
);
// Layout encoding is feature-gated on FS_FILE_LAYOUT_V2.
45 encode(layout
, bl
, CEPH_FEATURE_FS_FILE_LAYOUT_V2
);
46 encode(old_pools
, bl
);
// Pad the entry with 0xff bytes; loop bound pad_size is defined on a line
// not visible here -- TODO confirm against full source.
50 uint8_t static const pad
= 0xff;
51 for (unsigned int i
= 0; i
<pad_size
; i
++) {
// Deserialize a PurgeItem from 'p' (inverse of encode above).
57 void PurgeItem::decode(bufferlist::const_iterator
&p
)
60 decode((uint8_t&)action
, p
);
// NOTE(review): partial extraction -- several constructor parameters and
// initializer-list entries are missing from this view.
73 // TODO: if Objecter has any slow requests, take that as a hint and
74 // slow down our rate of purging (keep accepting pushes though)
// Constructor: wires up the Finisher, Filer and the Journaler that backs
// the on-disk purge queue ("pq" journal at ino MDS_INO_PURGE_QUEUE + rank,
// stored in the metadata pool).
75 PurgeQueue::PurgeQueue(
78 const int64_t metadata_pool_
,
85 metadata_pool(metadata_pool_
),
86 finisher(cct
, "PurgeQueue", "PQ_Finisher"),
88 filer(objecter_
, &finisher
),
90 journaler("pq", MDS_INO_PURGE_QUEUE
+ rank
, metadata_pool
,
91 CEPH_FS_ONDISK_MAGIC
, objecter_
, nullptr, 0,
98 delayed_flush(nullptr),
// Body: required collaborators must be non-null; journal write errors are
// routed to the caller-supplied on_error context.
101 ceph_assert(cct
!= nullptr);
102 ceph_assert(on_error
!= nullptr);
103 ceph_assert(objecter
!= nullptr);
104 journaler
.set_write_error_handler(on_error
);
// Destructor: unregister our perf counters from the global collection.
107 PurgeQueue::~PurgeQueue()
110 g_ceph_context
->get_perfcounters_collection()->remove(logger
.get());
// NOTE(review): partial extraction -- some counter registrations and the
// enclosing braces are missing from this view.
// Register this PurgeQueue's perf counters ("purge_queue" subsystem) with
// the global perf counters collection.
115 void PurgeQueue::create_logger()
117 PerfCountersBuilder
pcb(g_ceph_context
, "purge_queue", l_pq_first
, l_pq_last
);
// pq_executed is the headline counter (PRIO_INTERESTING).
119 pcb
.add_u64_counter(l_pq_executed
, "pq_executed", "Purge queue tasks executed",
120 "purg", PerfCountersBuilder::PRIO_INTERESTING
);
// Remaining gauges default to PRIO_USEFUL.
122 pcb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
123 pcb
.add_u64(l_pq_executing_ops
, "pq_executing_ops", "Purge queue ops in flight");
124 pcb
.add_u64(l_pq_executing_ops_high_water
, "pq_executing_ops_high_water", "Maximum number of executing file purge ops");
125 pcb
.add_u64(l_pq_executing
, "pq_executing", "Purge queue tasks in flight");
126 pcb
.add_u64(l_pq_executing_high_water
, "pq_executing_high_water", "Maximum number of executing file purges");
// Build the counters object and publish it globally.
128 logger
.reset(pcb
.create_perf_counters());
129 g_ceph_context
->get_perfcounters_collection()->add(logger
.get());
// NOTE(review): partial extraction -- bodies of these three methods are
// incomplete in this view (branch bodies / closing braces missing).
// init(): sanity-check that create_logger() already ran.
132 void PurgeQueue::init()
134 std::lock_guard
l(lock
);
136 ceph_assert(logger
!= nullptr);
// activate(): kick off consumption, unless readonly or already drained.
142 void PurgeQueue::activate()
144 std::lock_guard
l(lock
);
147 dout(10) << "skipping activate: PurgeQueue is readonly" << dendl
;
// read_pos == write_pos means the journal is empty -- nothing to do.
151 if (journaler
.get_read_pos() == journaler
.get_write_pos())
154 if (in_flight
.empty()) {
155 dout(4) << "start work (by drain)" << dendl
;
// Consumption is restarted from the Finisher thread, re-taking the lock.
156 finisher
.queue(new FunctionContext([this](int r
) {
157 std::lock_guard
l(lock
);
// shutdown(): stop the underlying journaler.
163 void PurgeQueue::shutdown()
165 std::lock_guard
l(lock
);
167 journaler
.shutdown();
// NOTE(review): partial extraction -- error/ENOENT branch structure and
// several statements are missing from this view.
// open(): recover the purge queue journal; 'completion' fires once
// recovery finishes (queued on waiting_for_recovery).
172 void PurgeQueue::open(Context
*completion
)
174 dout(4) << "opening" << dendl
;
176 std::lock_guard
l(lock
);
179 waiting_for_recovery
.push_back(completion
);
181 journaler
.recover(new FunctionContext([this](int r
){
// ENOENT path (presumably r == -ENOENT -- TODO confirm): upgrade case,
// fall through to creating a fresh queue.
183 dout(1) << "Purge Queue not found, assuming this is an upgrade and "
184 "creating it." << dendl
;
187 std::lock_guard
l(lock
);
188 dout(4) << "open complete" << dendl
;
190 // Journaler only guarantees entries before head write_pos have been
191 // fully flushed. Before appending new entries, we need to find and
192 // drop any partial written entry.
193 if (journaler
.last_committed
.write_pos
< journaler
.get_write_pos()) {
194 dout(4) << "recovering write_pos" << dendl
;
195 journaler
.set_read_pos(journaler
.last_committed
.write_pos
);
200 journaler
.set_writeable();
202 finish_contexts(g_ceph_context
, waiting_for_recovery
);
// Any other recovery error is logged (error handling continues on lines
// not visible here).
204 derr
<< "Error " << r
<< " loading Journaler" << dendl
;
// wait_for_recovery(): queue 'c' until recovery completes; readonly
// queues never complete waiters.
210 void PurgeQueue::wait_for_recovery(Context
* c
)
212 std::lock_guard
l(lock
);
215 } else if (readonly
) {
216 dout(10) << "cannot wait for recovery: PurgeQueue is readonly" << dendl
;
219 waiting_for_recovery
.push_back(c
);
// NOTE(review): partial extraction -- loop/branch structure and several
// statements are missing from this view.
// _recover(): scan forward from last_committed.write_pos to locate (and
// implicitly drop) any partially-written tail entry, then restore the
// real read position and make the journal writeable. Caller holds 'lock'.
223 void PurgeQueue::_recover()
225 ceph_assert(lock
.is_locked_by_me());
227 // Journaler::is_readable() adjusts write_pos if partial entry is encountered
229 if (!journaler
.is_readable() &&
230 !journaler
.get_error() &&
231 journaler
.get_read_pos() < journaler
.get_write_pos()) {
// Not readable yet but more data pending: wait, then resume recovery
// from the Finisher callback.
232 journaler
.wait_for_readable(new FunctionContext([this](int r
) {
233 std::lock_guard
l(lock
);
239 if (journaler
.get_error()) {
240 int r
= journaler
.get_error();
241 derr
<< "Error " << r
<< " recovering write_pos" << dendl
;
// Fully scanned: write_pos is now trustworthy.
246 if (journaler
.get_read_pos() == journaler
.get_write_pos()) {
247 dout(4) << "write_pos recovered" << dendl
;
248 // restore original read_pos
249 journaler
.set_read_pos(journaler
.last_committed
.expire_pos
);
250 journaler
.set_writeable();
252 finish_contexts(g_ceph_context
, waiting_for_recovery
);
// Otherwise consume one entry and keep scanning (loop not visible here).
257 bool readable
= journaler
.try_read_entry(bl
);
258 ceph_assert(readable
); // we checked earlier
// create(): initialize a brand-new purge queue journal in the metadata
// pool; 'fin' fires after the head is first written.
262 void PurgeQueue::create(Context
*fin
)
264 dout(4) << "creating" << dendl
;
265 std::lock_guard
l(lock
);
268 waiting_for_recovery
.push_back(fin
);
270 file_layout_t layout
= file_layout_t::get_default();
271 layout
.pool_id
= metadata_pool
;
272 journaler
.set_writeable();
273 journaler
.create(&layout
, JOURNAL_FORMAT_RESILIENT
);
274 journaler
.write_head(new FunctionContext([this](int r
) {
275 std::lock_guard
l(lock
);
280 finish_contexts(g_ceph_context
, waiting_for_recovery
);
// NOTE(review): partial extraction -- the entry-encoding statements and
// closing braces are missing from this view.
286 * The `completion` context will always be called back via a Finisher
// push(): append a PurgeItem to the journal; 'completion' fires once the
// entry is flushed (or with -EROFS if the queue has gone readonly).
288 void PurgeQueue::push(const PurgeItem
&pi
, Context
*completion
)
290 dout(4) << "pushing inode " << pi
.ino
<< dendl
;
291 std::lock_guard
l(lock
);
// Readonly queue: reject the push immediately.
294 dout(10) << "cannot push inode: PurgeQueue is readonly" << dendl
;
295 completion
->complete(-EROFS
);
299 // Callers should have waited for open() before using us
300 ceph_assert(!journaler
.is_readonly());
305 journaler
.append_entry(bl
);
306 journaler
.wait_for_flush(completion
);
308 // Maybe go ahead and do something with it right away
309 bool could_consume
= _consume();
310 if (!could_consume
) {
311 // Usually, it is not necessary to explicitly flush here, because the reader
312 // will get flushes generated inside Journaler::is_readable. However,
313 // if we remain in a _can_consume()==false state for a long period then
314 // we should flush in order to allow MDCache to drop its strays rather
315 // than having them wait for purgequeue to progress.
316 if (!delayed_flush
) {
317 delayed_flush
= new FunctionContext([this](int r
){
318 delayed_flush
= nullptr;
// Schedule the deferred flush after the configured busy-flush period.
322 timer
.add_event_after(
323 g_conf()->mds_purge_queue_busy_flush_period
,
// NOTE(review): partial extraction -- 'leaves' declaration, some branch
// bodies, early returns and closing braces are missing from this view.
// _calculate_ops(): estimate how many RADOS ops executing 'item' will
// issue, for throttling against max_purge_ops.
329 uint32_t PurgeQueue::_calculate_ops(const PurgeItem
&item
) const
331 uint32_t ops_required
= 0;
332 if (item
.action
== PurgeItem::PURGE_DIR
) {
333 // Directory, count dirfrags to be deleted
335 if (!item
.fragtree
.is_leaf(frag_t())) {
336 item
.fragtree
.get_leaves(leaves
);
338 // One for the root, plus any leaves
339 ops_required
= 1 + leaves
.size();
341 // File, work out concurrent Filer::purge deletes
// size 0 still needs one op for object 0.
342 const uint64_t num
= (item
.size
> 0) ?
343 Striper::get_num_objects(item
.layout
, item
.size
) : 1;
345 ops_required
= std::min(num
, g_conf()->filer_max_purge_ops
);
347 // Account for removing (or zeroing) backtrace
350 // Account for deletions for old pools
351 if (item
.action
!= PurgeItem::TRUNCATE_FILE
) {
352 ops_required
+= item
.old_pools
.size();
// _can_consume(): throttle gate -- false when readonly, when the op
// budget is exhausted, or when too many files are in flight.
359 bool PurgeQueue::_can_consume()
362 dout(10) << "can't consume: PurgeQueue is readonly" << dendl
;
366 dout(20) << ops_in_flight
<< "/" << max_purge_ops
<< " ops, "
367 << in_flight
.size() << "/" << g_conf()->mds_max_purge_files
368 << " files" << dendl
;
370 if (in_flight
.size() == 0 && cct
->_conf
->mds_max_purge_files
> 0) {
371 // Always permit consumption if nothing is in flight, so that the ops
372 // limit can never be so low as to forbid all progress (unless
373 // administrator has deliberately paused purging by setting max
374 // purge files to zero).
378 if (ops_in_flight
>= max_purge_ops
) {
379 dout(20) << "Throttling on op limit " << ops_in_flight
<< "/"
380 << max_purge_ops
<< dendl
;
384 if (in_flight
.size() >= cct
->_conf
->mds_max_purge_files
) {
385 dout(20) << "Throttling on item limit " << in_flight
.size()
386 << "/" << cct
->_conf
->mds_max_purge_files
<< dendl
;
// _go_readonly(): one-way transition after an internal IO error -- notify
// on_error, mark the journaler readonly, and fail pending recovery
// waiters with 'r'.
393 void PurgeQueue::_go_readonly(int r
)
395 if (readonly
) return;
396 dout(1) << "going readonly because internal IO failed: " << strerror(-r
) << dendl
;
398 on_error
->complete(r
);
400 journaler
.set_readonly();
401 finish_contexts(g_ceph_context
, waiting_for_recovery
, r
);
// NOTE(review): partial extraction -- 'item' declaration, the decode call
// inside the try block, and several braces are missing from this view.
// _consume(): drain journal entries while the throttle allows, executing
// each decoded PurgeItem. Returns whether anything was (or is being)
// consumed. Caller holds 'lock'.
404 bool PurgeQueue::_consume()
406 ceph_assert(lock
.is_locked_by_me());
408 bool could_consume
= false;
409 while(_can_consume()) {
412 // We are now going to read from the journal, so any proactive
413 // flush is no longer necessary. This is not functionally necessary
414 // but it can avoid generating extra fragmented flush IOs.
415 timer
.cancel_event(delayed_flush
);
416 delayed_flush
= nullptr;
417 if (int r
= journaler
.get_error()) {
// NOTE(review): partial extraction -- the expire_to parameter declaration,
// 'leaves' declaration, gather.new_sub() arguments on some calls, the
// invalid-action early return and several braces are missing from this view.
// _execute_item(): issue the RADOS deletions/zeroing for one PurgeItem and
// track it in in_flight under its journal offset 'expire_to'; on gather
// completion _execute_item_complete() runs via the Finisher. Caller holds
// 'lock'.
468 void PurgeQueue::_execute_item(
469 const PurgeItem
&item
,
472 ceph_assert(lock
.is_locked_by_me());
// Account the item and its estimated op cost before issuing IO.
474 in_flight
[expire_to
] = item
;
475 logger
->set(l_pq_executing
, in_flight
.size());
476 files_high_water
= std::max(files_high_water
, in_flight
.size());
477 logger
->set(l_pq_executing_high_water
, files_high_water
);
478 auto ops
= _calculate_ops(item
);
479 ops_in_flight
+= ops
;
480 logger
->set(l_pq_executing_ops
, ops_in_flight
);
481 ops_high_water
= std::max(ops_high_water
, ops_in_flight
);
482 logger
->set(l_pq_executing_ops_high_water
, ops_high_water
);
484 SnapContext nullsnapc
;
486 C_GatherBuilder
gather(cct
);
487 if (item
.action
== PurgeItem::PURGE_FILE
) {
// PURGE_FILE: delete all file data objects, then the backtrace object,
// then backtrace copies in any old pools.
489 uint64_t num
= Striper::get_num_objects(item
.layout
, item
.size
);
490 dout(10) << " 0~" << item
.size
<< " objects 0~" << num
491 << " snapc " << item
.snapc
<< " on " << item
.ino
<< dendl
;
492 filer
.purge_range(item
.ino
, &item
.layout
, item
.snapc
,
493 0, num
, ceph::real_clock::now(), 0,
497 // remove the backtrace object if it was not purged
498 object_t oid
= CInode::get_object_name(item
.ino
, frag_t(), "");
499 if (!gather
.has_subs() || !item
.layout
.pool_ns
.empty()) {
500 object_locator_t
oloc(item
.layout
.pool_id
);
501 dout(10) << " remove backtrace object " << oid
502 << " pool " << oloc
.pool
<< " snapc " << item
.snapc
<< dendl
;
503 objecter
->remove(oid
, oloc
, item
.snapc
,
504 ceph::real_clock::now(), 0,
508 // remove old backtrace objects
509 for (const auto &p
: item
.old_pools
) {
510 object_locator_t
oloc(p
);
511 dout(10) << " remove backtrace object " << oid
512 << " old pool " << p
<< " snapc " << item
.snapc
<< dendl
;
513 objecter
->remove(oid
, oloc
, item
.snapc
,
514 ceph::real_clock::now(), 0,
517 } else if (item
.action
== PurgeItem::PURGE_DIR
) {
// PURGE_DIR: remove every dirfrag object (all fragtree leaves plus the
// root frag) from the metadata pool.
518 object_locator_t
oloc(metadata_pool
);
520 if (!item
.fragtree
.is_leaf(frag_t()))
521 item
.fragtree
.get_leaves(leaves
);
522 leaves
.push_back(frag_t());
523 for (const auto &leaf
: leaves
) {
524 object_t oid
= CInode::get_object_name(item
.ino
, leaf
, "");
525 dout(10) << " remove dirfrag " << oid
<< dendl
;
526 objecter
->remove(oid
, oloc
, nullsnapc
,
527 ceph::real_clock::now(),
528 0, gather
.new_sub());
530 } else if (item
.action
== PurgeItem::TRUNCATE_FILE
) {
// TRUNCATE_FILE: delete objects 1..num-1 but keep object 0 (which holds
// the backtrace) and just zero its data range instead.
531 const uint64_t num
= Striper::get_num_objects(item
.layout
, item
.size
);
532 dout(10) << " 0~" << item
.size
<< " objects 0~" << num
533 << " snapc " << item
.snapc
<< " on " << item
.ino
<< dendl
;
535 // keep backtrace object
537 filer
.purge_range(item
.ino
, &item
.layout
, item
.snapc
,
538 1, num
- 1, ceph::real_clock::now(),
539 0, gather
.new_sub());
541 filer
.zero(item
.ino
, &item
.layout
, item
.snapc
,
542 0, item
.layout
.object_size
,
543 ceph::real_clock::now(),
544 0, true, gather
.new_sub());
// Unknown action: drop the item and roll back the accounting done above.
546 derr
<< "Invalid item (action=" << item
.action
<< ") in purge queue, "
547 "dropping it" << dendl
;
548 ops_in_flight
-= ops
;
549 logger
->set(l_pq_executing_ops
, ops_in_flight
);
550 ops_high_water
= std::max(ops_high_water
, ops_in_flight
);
551 logger
->set(l_pq_executing_ops_high_water
, ops_high_water
);
552 in_flight
.erase(expire_to
);
553 logger
->set(l_pq_executing
, in_flight
.size());
554 files_high_water
= std::max(files_high_water
, in_flight
.size());
555 logger
->set(l_pq_executing_high_water
, files_high_water
);
558 ceph_assert(gather
.has_subs());
// Completion runs on the Finisher so it can safely re-take 'lock'.
560 gather
.set_finisher(new C_OnFinisher(
561 new FunctionContext([this, expire_to
](int r
){
562 std::lock_guard
l(lock
);
563 _execute_item_complete(expire_to
);
567 // Have we gone idle? If so, do an extra write_head now instead of
568 // waiting for next flush after journaler_write_head_interval.
569 // Also do this periodically even if not idle, so that the persisted
570 // expire_pos doesn't fall too far behind our progress when consuming
571 // a very long queue.
572 if (in_flight
.empty() || journaler
.write_head_needed()) {
573 journaler
.write_head(nullptr);
// NOTE(review): partial extraction -- the expire_to parameter declaration,
// the computation of iterator 'n', the do-loop body and several braces are
// missing from this view.
// _execute_item_complete(): called (under 'lock') when all RADOS ops for
// the item at journal offset 'expire_to' finish. Advances the journal
// expire_pos when the oldest in-flight item completes; out-of-order
// completions are parked in pending_expire until then.
580 void PurgeQueue::_execute_item_complete(
583 ceph_assert(lock
.is_locked_by_me());
584 dout(10) << "complete at 0x" << std::hex
<< expire_to
<< std::dec
<< dendl
;
585 ceph_assert(in_flight
.count(expire_to
) == 1);
587 auto iter
= in_flight
.find(expire_to
);
588 ceph_assert(iter
!= in_flight
.end());
589 if (iter
== in_flight
.begin()) {
// Oldest in-flight item finished: we may expire past it, and possibly
// past previously-parked completions too.
590 uint64_t pos
= expire_to
;
591 if (!pending_expire
.empty()) {
594 if (n
== in_flight
.end()) {
// Nothing else in flight: everything parked can be expired.
595 pos
= *pending_expire
.rbegin();
596 pending_expire
.clear();
598 auto p
= pending_expire
.begin();
603 pending_expire
.erase(p
++);
604 } while (p
!= pending_expire
.end());
607 dout(10) << "expiring to 0x" << std::hex
<< pos
<< std::dec
<< dendl
;
608 journaler
.set_expire_pos(pos
);
610 // This is completely fine, we're not supposed to purge files in
611 // order when doing them in parallel.
612 dout(10) << "non-sequential completion, not expiring anything" << dendl
;
613 pending_expire
.insert(expire_to
);
// Release the item's op budget and in-flight slot, update gauges.
616 ops_in_flight
-= _calculate_ops(iter
->second
);
617 logger
->set(l_pq_executing_ops
, ops_in_flight
);
618 ops_high_water
= std::max(ops_high_water
, ops_in_flight
);
619 logger
->set(l_pq_executing_ops_high_water
, ops_high_water
);
621 dout(10) << "completed item for ino " << iter
->second
.ino
<< dendl
;
623 in_flight
.erase(iter
);
624 logger
->set(l_pq_executing
, in_flight
.size());
625 files_high_water
= std::max(files_high_water
, in_flight
.size());
626 logger
->set(l_pq_executing_high_water
, files_high_water
);
627 dout(10) << "in_flight.size() now " << in_flight
.size() << dendl
;
629 logger
->inc(l_pq_executed
);
// NOTE(review): partial extraction -- early returns, loop 'continue', and
// closing braces are missing from this view.
// update_op_limit(): recompute max_purge_ops from the number of PGs across
// all data pools divided among active MDS ranks, scaled by
// mds_max_purge_ops_per_pg and capped by mds_max_purge_ops.
632 void PurgeQueue::update_op_limit(const MDSMap
&mds_map
)
634 std::lock_guard
l(lock
);
637 dout(10) << "skipping; PurgeQueue is readonly" << dendl
;
641 uint64_t pg_count
= 0;
642 objecter
->with_osdmap([&](const OSDMap
& o
) {
643 // Number of PGs across all data pools
644 const std::vector
<int64_t> &data_pools
= mds_map
.get_data_pools();
645 for (const auto dp
: data_pools
) {
646 if (o
.get_pg_pool(dp
) == NULL
) {
647 // It is possible that we have an older OSDMap than MDSMap,
648 // because we don't start watching every OSDMap until after
649 // MDSRank is initialized
650 dout(4) << " data pool " << dp
<< " not found in OSDMap" << dendl
;
653 pg_count
+= o
.get_pg_num(dp
);
657 // Work out a limit based on n_pgs / n_mdss, multiplied by the user's
658 // preference for how many ops per PG
659 max_purge_ops
= uint64_t(((double)pg_count
/ (double)mds_map
.get_max_mds()) *
660 cct
->_conf
->mds_max_purge_ops_per_pg
);
662 // User may also specify a hard limit, apply this if so.
663 if (cct
->_conf
->mds_max_purge_ops
) {
664 max_purge_ops
= std::min(max_purge_ops
, cct
->_conf
->mds_max_purge_ops
);
// handle_conf_change(): react to runtime config changes -- recompute op
// limits, or kick consumption when the file limit may have been raised
// from zero.
668 void PurgeQueue::handle_conf_change(const std::set
<std::string
>& changed
, const MDSMap
& mds_map
)
670 if (changed
.count("mds_max_purge_ops")
671 || changed
.count("mds_max_purge_ops_per_pg")) {
672 update_op_limit(mds_map
);
673 } else if (changed
.count("mds_max_purge_files")) {
674 std::lock_guard
l(lock
);
675 if (in_flight
.empty()) {
676 // We might have gone from zero to a finite limit, so
677 // might need to kick off consume.
678 dout(4) << "maybe start work again (max_purge_files="
679 << g_conf()->mds_max_purge_files
<< dendl
;
680 finisher
.queue(new FunctionContext([this](int r
){
681 std::lock_guard
l(lock
);
// NOTE(review): partial extraction -- the 'progress' parameter
// declaration, early returns, the draining flag/branch and closing braces
// are missing from this view.
// drain(): shutdown-time fast-drain. Reports progress (bytes consumed out
// of the initial backlog) and the in-flight count via out-params; lifts
// the op throttle so the queue empties as fast as possible. Returns
// whether draining is finished.
688 bool PurgeQueue::drain(
690 uint64_t *progress_total
,
691 size_t *in_flight_count
694 std::lock_guard
l(lock
);
697 dout(10) << "skipping drain; PurgeQueue is readonly" << dendl
;
701 ceph_assert(progress
!= nullptr);
702 ceph_assert(progress_total
!= nullptr);
703 ceph_assert(in_flight_count
!= nullptr);
// Done when nothing is executing and the journal is fully consumed.
705 const bool done
= in_flight
.empty() && (
706 journaler
.get_read_pos() == journaler
.get_write_pos());
711 const uint64_t bytes_remaining
= journaler
.get_write_pos()
712 - journaler
.get_read_pos();
715 // Start of draining: remember how much there was outstanding at
716 // this point so that we can give a progress percentage later
719 // Lift the op throttle as this daemon now has nothing to do but
720 // drain the purge queue, so do it as fast as we can.
721 max_purge_ops
= 0xffff;
724 drain_initial
= std::max(bytes_remaining
, drain_initial
);
726 *progress
= drain_initial
- bytes_remaining
;
727 *progress_total
= drain_initial
;
728 *in_flight_count
= in_flight
.size();
733 std::string_view
PurgeItem::get_type_str() const
736 case PurgeItem::NONE
: return "NONE";
737 case PurgeItem::PURGE_FILE
: return "PURGE_FILE";
738 case PurgeItem::PURGE_DIR
: return "PURGE_DIR";
739 case PurgeItem::TRUNCATE_FILE
: return "TRUNCATE_FILE";