// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "common/debug.h"
#include "mds/mdstypes.h"
#include "mds/CInode.h"
#include "mds/MDCache.h"

#include "PurgeQueue.h"

#define dout_context cct
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix _prefix(_dout, rank) << __func__ << ": "
static ostream& _prefix(std::ostream *_dout, mds_rank_t rank) {
  return *_dout << "mds." << rank << ".purge_queue ";
}

void PurgeItem::encode(bufferlist &bl) const
{
  ENCODE_START(1, 1, bl);
  ::encode((uint8_t)action, bl);
  ::encode(ino, bl);
  ::encode(size, bl);
  ::encode(layout, bl, CEPH_FEATURE_FS_FILE_LAYOUT_V2);
  ::encode(old_pools, bl);
  ::encode(snapc, bl);
  ::encode(fragtree, bl);
  ENCODE_FINISH(bl);
}

void PurgeItem::decode(bufferlist::iterator &p)
{
  DECODE_START(1, p);
  ::decode((uint8_t&)action, p);
  ::decode(ino, p);
  ::decode(size, p);
  ::decode(layout, p);
  ::decode(old_pools, p);
  ::decode(snapc, p);
  ::decode(fragtree, p);
  DECODE_FINISH(p);
}

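// Illustrative round-trip (not part of the original file): decode() must read
// fields in exactly the order encode() wrote them, e.g.
//
//   PurgeItem in, out;
//   bufferlist bl;
//   in.encode(bl);
//   bufferlist::iterator p = bl.begin();
//   out.decode(p);
//
// The ENCODE_START(1, 1, ...)/DECODE_START(1, ...) pair carries the struct
// version so the on-disk format can be extended compatibly later.
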
// TODO: if Objecter has any slow requests, take that as a hint and
// slow down our rate of purging (keep accepting pushes though)
PurgeQueue::PurgeQueue(
      CephContext *cct_,
      mds_rank_t rank_,
      const int64_t metadata_pool_,
      Objecter *objecter_,
      Context *on_error_)
  :
    cct(cct_),
    rank(rank_),
    metadata_pool(metadata_pool_),
    finisher(cct, "PurgeQueue", "PQ_Finisher"),
    timer(cct, lock),
    filer(objecter_, &finisher),
    objecter(objecter_),
    journaler("pq", MDS_INO_PURGE_QUEUE + rank, metadata_pool,
      CEPH_FS_ONDISK_MAGIC, objecter_, nullptr, 0,
      &finisher),
    on_error(on_error_),
    ops_in_flight(0),
    max_purge_ops(0),
    drain_initial(0),
    draining(false),
    delayed_flush(nullptr),
    recovered(false)
{
  assert(cct != nullptr);
  assert(on_error != nullptr);
  assert(objecter != nullptr);
  journaler.set_write_error_handler(on_error);
}

PurgeQueue::~PurgeQueue()
{
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
}

void PurgeQueue::create_logger()
{
  PerfCountersBuilder pcb(g_ceph_context,
          "purge_queue", l_pq_first, l_pq_last);

  pcb.add_u64_counter(l_pq_executed, "pq_executed",
          "Purge queue tasks executed",
          "purg", PerfCountersBuilder::PRIO_INTERESTING);

  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  pcb.add_u64(l_pq_executing_ops, "pq_executing_ops",
          "Purge queue ops in flight");
  pcb.add_u64(l_pq_executing, "pq_executing",
          "Purge queue tasks in flight");

  logger.reset(pcb.create_perf_counters());
  g_ceph_context->get_perfcounters_collection()->add(logger.get());
}

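// Note (informational): these counters are registered under the
// "purge_queue" section of the MDS perf counters, so they can be inspected
// via the daemon's admin socket (e.g. a "perf dump"); pq_executed is marked
// PRIO_INTERESTING so it also appears in summarised views.
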
void PurgeQueue::init()
{
  Mutex::Locker l(lock);

  assert(logger != nullptr);

  finisher.start();
  timer.init();
}

void PurgeQueue::activate()
{
  Mutex::Locker l(lock);
  if (journaler.get_read_pos() == journaler.get_write_pos())
    return;

  if (in_flight.empty()) {
    dout(4) << "start work (by drain)" << dendl;
    finisher.queue(new FunctionContext([this](int r) {
          Mutex::Locker l(lock);
          _consume();
        }));
  }
}

void PurgeQueue::shutdown()
{
  Mutex::Locker l(lock);

  journaler.shutdown();
  timer.shutdown();
  finisher.stop();
}

void PurgeQueue::open(Context *completion)
{
  dout(4) << "opening" << dendl;

  Mutex::Locker l(lock);

  if (completion)
    waiting_for_recovery.push_back(completion);

  journaler.recover(new FunctionContext([this](int r){
    if (r == -ENOENT) {
      dout(1) << "Purge Queue not found, assuming this is an upgrade and "
                 "creating it." << dendl;
      create(nullptr);
    } else if (r == 0) {
      Mutex::Locker l(lock);
      dout(4) << "open complete" << dendl;

      // Journaler only guarantees entries before head write_pos have been
      // fully flushed.  Before appending new entries, we need to find and
      // drop any partially written entry.
      if (journaler.last_committed.write_pos < journaler.get_write_pos()) {
        dout(4) << "recovering write_pos" << dendl;
        journaler.set_read_pos(journaler.last_committed.write_pos);
        _recover();
        return;
      }

      journaler.set_writeable();
      recovered = true;
      finish_contexts(g_ceph_context, waiting_for_recovery);
    } else {
      derr << "Error " << r << " loading Journaler" << dendl;
      on_error->complete(r);
    }
  }));
}

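// Example of the recovery case above (hypothetical offsets): if the last
// committed header recorded write_pos=0x1000 but the journaler now reports
// write_pos=0x1400, the tail 0x1000..0x1400 may end in a torn entry, so we
// re-read from 0x1000 in _recover() before allowing new appends.
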
void PurgeQueue::wait_for_recovery(Context* c)
{
  Mutex::Locker l(lock);
  if (recovered)
    c->complete(0);
  else
    waiting_for_recovery.push_back(c);
}

void PurgeQueue::_recover()
{
  assert(lock.is_locked_by_me());

  // Journaler::is_readable() adjusts write_pos if partial entry is encountered
  while (1) {
    if (!journaler.is_readable() &&
        !journaler.get_error() &&
        journaler.get_read_pos() < journaler.get_write_pos()) {
      journaler.wait_for_readable(new FunctionContext([this](int r) {
        Mutex::Locker l(lock);
        _recover();
      }));
      return;
    }

    if (journaler.get_error()) {
      int r = journaler.get_error();
      derr << "Error " << r << " recovering write_pos" << dendl;
      on_error->complete(r);
      return;
    }

    if (journaler.get_read_pos() == journaler.get_write_pos()) {
      dout(4) << "write_pos recovered" << dendl;
      // restore original read_pos
      journaler.set_read_pos(journaler.last_committed.expire_pos);
      journaler.set_writeable();
      recovered = true;
      finish_contexts(g_ceph_context, waiting_for_recovery);
      return;
    }

    bufferlist bl;
    bool readable = journaler.try_read_entry(bl);
    assert(readable);  // we checked earlier
  }
}

void PurgeQueue::create(Context *fin)
{
  dout(4) << "creating" << dendl;
  Mutex::Locker l(lock);

  if (fin)
    waiting_for_recovery.push_back(fin);

  file_layout_t layout = file_layout_t::get_default();
  layout.pool_id = metadata_pool;
  journaler.set_writeable();
  journaler.create(&layout, JOURNAL_FORMAT_RESILIENT);
  journaler.write_head(new FunctionContext([this](int r) {
    Mutex::Locker l(lock);
    recovered = true;
    finish_contexts(g_ceph_context, waiting_for_recovery);
  }));
}

/**
 * The `completion` context will always be called back via a Finisher
 */
void PurgeQueue::push(const PurgeItem &pi, Context *completion)
{
  dout(4) << "pushing inode 0x" << std::hex << pi.ino << std::dec << dendl;
  Mutex::Locker l(lock);

  // Callers should have waited for open() before using us
  assert(!journaler.is_readonly());

  bufferlist bl;

  ::encode(pi, bl);
  journaler.append_entry(bl);
  journaler.wait_for_flush(completion);

  // Maybe go ahead and do something with it right away
  bool could_consume = _consume();
  if (!could_consume) {
    // Usually it is not necessary to explicitly flush here, because the
    // reader will get flushes generated inside Journaler::is_readable.
    // However, if we remain in a can_consume()==false state for a long
    // period then we should flush in order to allow MDCache to drop its
    // strays rather than having them wait for the PurgeQueue to progress.
    if (!delayed_flush) {
      delayed_flush = new FunctionContext([this](int r){
            delayed_flush = nullptr;
            journaler.flush();
          });

      timer.add_event_after(
          g_conf->mds_purge_queue_busy_flush_period,
          delayed_flush);
    }
  }
}

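// Timing sketch (illustrative): if can_consume() stays false, e.g. because
// the file limit is already reached, for longer than
// mds_purge_queue_busy_flush_period, the delayed_flush event fires and
// flushes the journal anyway, so pushers' completions still run and MDCache
// can drop the corresponding strays.
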
uint32_t PurgeQueue::_calculate_ops(const PurgeItem &item) const
{
  uint32_t ops_required = 0;
  if (item.action == PurgeItem::PURGE_DIR) {
    // Directory, count dirfrags to be deleted
    std::list<frag_t> ls;
    if (!item.fragtree.is_leaf(frag_t())) {
      item.fragtree.get_leaves(ls);
    }
    // One for the root, plus any leaves
    ops_required = 1 + ls.size();
  } else {
    // File, work out concurrent Filer::purge deletes
    const uint64_t num = (item.size > 0) ?
      Striper::get_num_objects(item.layout, item.size) : 1;

    ops_required = MIN(num, g_conf->filer_max_purge_ops);

    // Account for removing (or zeroing) backtrace
    ops_required += 1;

    // Account for deletions for old pools
    if (item.action != PurgeItem::TRUNCATE_FILE) {
      ops_required += item.old_pools.size();
    }
  }

  return ops_required;
}

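// Worked examples (hypothetical item shapes, filer_max_purge_ops assumed 8):
//  - PURGE_DIR with 4 leaf dirfrags:        1 + 4               = 5 ops
//  - PURGE_FILE, 100 objects, 1 old pool:   MIN(100, 8) + 1 + 1 = 10 ops
//  - TRUNCATE_FILE, 100 objects:            MIN(100, 8) + 1     = 9 ops
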
bool PurgeQueue::can_consume()
{
  dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, "
           << in_flight.size() << "/" << g_conf->mds_max_purge_files
           << " files" << dendl;

  if (in_flight.size() == 0 && cct->_conf->mds_max_purge_files > 0) {
    // Always permit consumption if nothing is in flight, so that the ops
    // limit can never be so low as to forbid all progress (unless
    // administrator has deliberately paused purging by setting max
    // purge files to zero).
    return true;
  }

  if (ops_in_flight >= max_purge_ops) {
    dout(20) << "Throttling on op limit " << ops_in_flight << "/"
             << max_purge_ops << dendl;
    return false;
  }

  if (in_flight.size() >= cct->_conf->mds_max_purge_files) {
    dout(20) << "Throttling on item limit " << in_flight.size()
             << "/" << cct->_conf->mds_max_purge_files << dendl;
    return false;
  } else {
    return true;
  }
}

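// Example (assumed limits): with max_purge_ops=64 and mds_max_purge_files=64,
// consumption continues while ops_in_flight < 64 and fewer than 64 items are
// in flight.  A single item that alone needs more than 64 ops is still
// admitted when nothing else is in flight, so progress can never stall
// completely unless mds_max_purge_files is deliberately set to zero.
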
bool PurgeQueue::_consume()
{
  assert(lock.is_locked_by_me());

  bool could_consume = false;
  while (can_consume()) {
    if (delayed_flush) {
      // We are now going to read from the journal, so any proactive
      // flush is no longer necessary.  This is not functionally necessary
      // but it can avoid generating extra fragmented flush IOs.
      timer.cancel_event(delayed_flush);
      delayed_flush = nullptr;
    }

    if (int r = journaler.get_error()) {
      derr << "Error " << r << " recovering write_pos" << dendl;
      on_error->complete(r);
      return could_consume;
    }

    if (!journaler.is_readable()) {
      dout(10) << " not readable right now" << dendl;
      // Because we are the writer and the reader of the journal
      // via the same Journaler instance, we never need to reread_head
      if (!journaler.have_waiter()) {
        journaler.wait_for_readable(new FunctionContext([this](int r) {
          Mutex::Locker l(lock);
          if (r == 0) {
            _consume();
          } else if (r != -EAGAIN) {
            on_error->complete(r);
          }
        }));
      }

      return could_consume;
    }

    could_consume = true;
    // The journaler is readable: consume an entry
    bufferlist bl;
    bool readable = journaler.try_read_entry(bl);
    assert(readable);  // we checked earlier

    dout(20) << " decoding entry" << dendl;
    PurgeItem item;
    bufferlist::iterator q = bl.begin();
    try {
      ::decode(item, q);
    } catch (const buffer::error &err) {
      derr << "Decode error at read_pos=0x" << std::hex
           << journaler.get_read_pos() << dendl;
      on_error->complete(0);
    }
    dout(20) << " executing item (0x" << std::hex << item.ino
             << std::dec << ")" << dendl;
    _execute_item(item, journaler.get_read_pos());
  }

  dout(10) << " cannot consume right now" << dendl;

  return could_consume;
}

void PurgeQueue::_execute_item(
    const PurgeItem &item,
    uint64_t expire_to)
{
  assert(lock.is_locked_by_me());

  in_flight[expire_to] = item;
  logger->set(l_pq_executing, in_flight.size());
  ops_in_flight += _calculate_ops(item);
  logger->set(l_pq_executing_ops, ops_in_flight);

  SnapContext nullsnapc;

  C_GatherBuilder gather(cct);
  if (item.action == PurgeItem::PURGE_FILE) {
    if (item.size > 0) {
      uint64_t num = Striper::get_num_objects(item.layout, item.size);
      dout(10) << " 0~" << item.size << " objects 0~" << num
               << " snapc " << item.snapc << " on " << item.ino << dendl;
      filer.purge_range(item.ino, &item.layout, item.snapc,
                        0, num, ceph::real_clock::now(), 0,
                        gather.new_sub());
    }

    // remove the backtrace object if it was not purged
    object_t oid = CInode::get_object_name(item.ino, frag_t(), "");
    if (!gather.has_subs() || !item.layout.pool_ns.empty()) {
      object_locator_t oloc(item.layout.pool_id);
      dout(10) << " remove backtrace object " << oid
               << " pool " << oloc.pool << " snapc " << item.snapc << dendl;
      objecter->remove(oid, oloc, item.snapc,
                       ceph::real_clock::now(), 0,
                       gather.new_sub());
    }

    // remove old backtrace objects
    for (const auto &p : item.old_pools) {
      object_locator_t oloc(p);
      dout(10) << " remove backtrace object " << oid
               << " old pool " << p << " snapc " << item.snapc << dendl;
      objecter->remove(oid, oloc, item.snapc,
                       ceph::real_clock::now(), 0,
                       gather.new_sub());
    }
  } else if (item.action == PurgeItem::PURGE_DIR) {
    object_locator_t oloc(metadata_pool);
    std::list<frag_t> frags;
    if (!item.fragtree.is_leaf(frag_t()))
      item.fragtree.get_leaves(frags);
    frags.push_back(frag_t());
    for (const auto &frag : frags) {
      object_t oid = CInode::get_object_name(item.ino, frag, "");
      dout(10) << " remove dirfrag " << oid << dendl;
      objecter->remove(oid, oloc, nullsnapc,
                       ceph::real_clock::now(),
                       0, gather.new_sub());
    }
  } else if (item.action == PurgeItem::TRUNCATE_FILE) {
    const uint64_t num = Striper::get_num_objects(item.layout, item.size);
    dout(10) << " 0~" << item.size << " objects 0~" << num
             << " snapc " << item.snapc << " on " << item.ino << dendl;

    // keep backtrace object
    if (num > 1) {
      filer.purge_range(item.ino, &item.layout, item.snapc,
                        1, num - 1, ceph::real_clock::now(),
                        0, gather.new_sub());
    }
    filer.zero(item.ino, &item.layout, item.snapc,
               0, item.layout.object_size,
               ceph::real_clock::now(),
               0, true, gather.new_sub());
  } else {
    derr << "Invalid item (action=" << item.action << ") in purge queue, "
            "dropping it" << dendl;
    in_flight.erase(expire_to);
    logger->set(l_pq_executing, in_flight.size());
    return;
  }
  assert(gather.has_subs());

  gather.set_finisher(new C_OnFinisher(
      new FunctionContext([this, expire_to](int r){
        Mutex::Locker l(lock);
        _execute_item_complete(expire_to);

        _consume();

        // Have we gone idle?  If so, do an extra write_head now instead of
        // waiting for next flush after journaler_write_head_interval.
        // Also do this periodically even if not idle, so that the persisted
        // expire_pos doesn't fall too far behind our progress when consuming
        // a very long queue.
        if (in_flight.empty() || journaler.write_head_needed()) {
          journaler.write_head(new FunctionContext([this](int r){
                journaler.trim();
              }));
        }
      }), &finisher));

  gather.activate();
}

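// Note on the write_head above (informational): persisting the head records
// the advanced expire_pos, so after an MDS restart the queue resumes near its
// real progress instead of re-reading (and harmlessly re-purging) a long
// backlog of already-completed entries.
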
void PurgeQueue::_execute_item_complete(
    uint64_t expire_to)
{
  assert(lock.is_locked_by_me());
  dout(10) << "complete at 0x" << std::hex << expire_to << std::dec << dendl;
  assert(in_flight.count(expire_to) == 1);

  auto iter = in_flight.find(expire_to);
  assert(iter != in_flight.end());
  if (iter == in_flight.begin()) {
    // This was the lowest journal position in flight, so we can now
    // safely expire the journal up to here.
    dout(10) << "expiring to 0x" << std::hex << expire_to << std::dec << dendl;
    journaler.set_expire_pos(expire_to);
  } else {
    // This is completely fine, we're not supposed to purge files in
    // order when doing them in parallel.
    dout(10) << "non-sequential completion, not expiring anything" << dendl;
  }

  ops_in_flight -= _calculate_ops(iter->second);
  logger->set(l_pq_executing_ops, ops_in_flight);

  dout(10) << "completed item for ino 0x" << std::hex << iter->second.ino
           << std::dec << dendl;

  in_flight.erase(iter);
  logger->set(l_pq_executing, in_flight.size());
  dout(10) << "in_flight.size() now " << in_flight.size() << dendl;

  logger->inc(l_pq_executed);
}

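// Completion-order example (hypothetical positions): with items in flight at
// 0x100, 0x200 and 0x300, completing 0x200 first expires nothing; when 0x100
// completes the expire_pos moves to 0x100, and it only reaches 0x300 once all
// three are done.  Out-of-order completion is expected when purging in
// parallel.
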
void PurgeQueue::update_op_limit(const MDSMap &mds_map)
{
  Mutex::Locker l(lock);

  uint64_t pg_count = 0;
  objecter->with_osdmap([&](const OSDMap& o) {
    // Number of PGs across all data pools
    const std::vector<int64_t> &data_pools = mds_map.get_data_pools();
    for (const auto dp : data_pools) {
      if (o.get_pg_pool(dp) == NULL) {
        // It is possible that we have an older OSDMap than MDSMap,
        // because we don't start watching every OSDMap until after
        // MDSRank is initialized
        dout(4) << " data pool " << dp << " not found in OSDMap" << dendl;
        continue;
      }
      pg_count += o.get_pg_num(dp);
    }
  });

  // Work out a limit based on n_pgs / n_mdss, multiplied by the user's
  // preference for how many ops per PG
  max_purge_ops = uint64_t(((double)pg_count / (double)mds_map.get_max_mds()) *
                           cct->_conf->mds_max_purge_ops_per_pg);

  // User may also specify a hard limit, apply this if so.
  if (cct->_conf->mds_max_purge_ops) {
    max_purge_ops = MIN(max_purge_ops, cct->_conf->mds_max_purge_ops);
  }
}

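// Worked example (assumed values): 2048 PGs across the data pools, max_mds=2
// and mds_max_purge_ops_per_pg=0.5 give
//   max_purge_ops = (2048 / 2) * 0.5 = 512,
// which is then clamped to mds_max_purge_ops if that is set non-zero.
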
void PurgeQueue::handle_conf_change(const struct md_config_t *conf,
                                    const std::set <std::string> &changed,
                                    const MDSMap &mds_map)
{
  if (changed.count("mds_max_purge_ops")
      || changed.count("mds_max_purge_ops_per_pg")) {
    update_op_limit(mds_map);
  } else if (changed.count("mds_max_purge_files")) {
    Mutex::Locker l(lock);

    if (in_flight.empty()) {
      // We might have gone from zero to a finite limit, so
      // might need to kick off consume.
      dout(4) << "maybe start work again (max_purge_files="
              << conf->mds_max_purge_files << dendl;
      finisher.queue(new FunctionContext([this](int r){
            Mutex::Locker l(lock);
            _consume();
          }));
    }
  }
}

bool PurgeQueue::drain(
    uint64_t *progress,
    uint64_t *progress_total,
    size_t *in_flight_count
    )
{
  assert(progress != nullptr);
  assert(progress_total != nullptr);
  assert(in_flight_count != nullptr);

  const bool done = in_flight.empty() && (
      journaler.get_read_pos() == journaler.get_write_pos());
  if (done) {
    return true;
  }

  const uint64_t bytes_remaining = journaler.get_write_pos()
                                   - journaler.get_read_pos();

  if (!draining) {
    // Start of draining: remember how much there was outstanding at
    // this point so that we can give a progress percentage later
    draining = true;

    // Lift the op throttle as this daemon now has nothing to do but
    // drain the purge queue, so do it as fast as we can.
    max_purge_ops = 0xffff;
  }

  drain_initial = max(bytes_remaining, drain_initial);

  *progress = drain_initial - bytes_remaining;
  *progress_total = drain_initial;
  *in_flight_count = in_flight.size();

  return false;
}
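
// Progress example (hypothetical byte positions): if draining starts with
// 0x8000 bytes between read_pos and write_pos, drain_initial becomes 0x8000;
// once 0x3000 bytes remain, *progress = 0x5000 and *progress_total = 0x8000,
// i.e. roughly 62% complete from the caller's point of view.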