// ceph/src/mds/PurgeQueue.cc (commit fa5d43f6229f584c25e935b067c05aac15dd57a8)
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "common/debug.h"
#include "mds/mdstypes.h"
#include "mds/CInode.h"
#include "mds/MDCache.h"

#include "PurgeQueue.h"


#define dout_context cct
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix _prefix(_dout, rank) << __func__ << ": "
static ostream& _prefix(std::ostream *_dout, mds_rank_t rank) {
  return *_dout << "mds." << rank << ".purge_queue ";
}

void PurgeItem::encode(bufferlist &bl) const
{
  ENCODE_START(1, 1, bl);
  ::encode((uint8_t)action, bl);
  ::encode(ino, bl);
  ::encode(size, bl);
  ::encode(layout, bl, CEPH_FEATURE_FS_FILE_LAYOUT_V2);
  ::encode(old_pools, bl);
  ::encode(snapc, bl);
  ::encode(fragtree, bl);
  ENCODE_FINISH(bl);
}

void PurgeItem::decode(bufferlist::iterator &p)
{
  DECODE_START(1, p);
  ::decode((uint8_t&)action, p);
  ::decode(ino, p);
  ::decode(size, p);
  ::decode(layout, p);
  ::decode(old_pools, p);
  ::decode(snapc, p);
  ::decode(fragtree, p);
  DECODE_FINISH(p);
}
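
// Note on wire format: ENCODE_START(1, 1, bl) stamps each entry as
// version 1 with compat version 1.  Any future change to PurgeItem's
// fields would need a version bump here (and a guarded decode path),
// since entries already sitting in the on-disk queue must stay decodable.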

// TODO: if Objecter has any slow requests, take that as a hint and
// slow down our rate of purging (keep accepting pushes though)
PurgeQueue::PurgeQueue(
      CephContext *cct_,
      mds_rank_t rank_,
      const int64_t metadata_pool_,
      Objecter *objecter_,
      Context *on_error_)
  :
    cct(cct_),
    rank(rank_),
    lock("PurgeQueue"),
    metadata_pool(metadata_pool_),
    finisher(cct, "PurgeQueue", "PQ_Finisher"),
    timer(cct, lock),
    filer(objecter_, &finisher),
    objecter(objecter_),
    journaler("pq", MDS_INO_PURGE_QUEUE + rank, metadata_pool,
              CEPH_FS_ONDISK_MAGIC, objecter_, nullptr, 0,
              &finisher),
    on_error(on_error_),
    ops_in_flight(0),
    max_purge_ops(0),
    drain_initial(0),
    draining(false),
    delayed_flush(nullptr),
    recovered(false)
{
  assert(cct != nullptr);
  assert(on_error != nullptr);
  assert(objecter != nullptr);
  journaler.set_write_error_handler(on_error);
}
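
// Each MDS rank maintains its own purge queue: the Journaler is keyed by
// inode number MDS_INO_PURGE_QUEUE + rank in the metadata pool, so the
// queue objects of different active ranks never overlap.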

PurgeQueue::~PurgeQueue()
{
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
}

void PurgeQueue::create_logger()
{
  PerfCountersBuilder pcb(g_ceph_context, "purge_queue", l_pq_first, l_pq_last);

  pcb.add_u64_counter(l_pq_executed, "pq_executed", "Purge queue tasks executed",
                      "purg", PerfCountersBuilder::PRIO_INTERESTING);

  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  pcb.add_u64(l_pq_executing_ops, "pq_executing_ops", "Purge queue ops in flight");
  pcb.add_u64(l_pq_executing, "pq_executing", "Purge queue tasks in flight");

  logger.reset(pcb.create_perf_counters());
  g_ceph_context->get_perfcounters_collection()->add(logger.get());
}

void PurgeQueue::init()
{
  Mutex::Locker l(lock);

  assert(logger != nullptr);

  finisher.start();
  timer.init();
}

void PurgeQueue::activate()
{
  Mutex::Locker l(lock);
  if (journaler.get_read_pos() == journaler.get_write_pos())
    return;

  if (in_flight.empty()) {
    dout(4) << "start work (by drain)" << dendl;
    finisher.queue(new FunctionContext([this](int r) {
      Mutex::Locker l(lock);
      _consume();
    }));
  }
}
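
// Note that the initial _consume() above is queued onto the Finisher rather
// than called inline while activate() holds the lock: the Finisher thread
// re-takes `lock` itself, which keeps journal reads and purge dispatch off
// the activating caller's stack.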

void PurgeQueue::shutdown()
{
  Mutex::Locker l(lock);

  journaler.shutdown();
  timer.shutdown();
  finisher.stop();
}

void PurgeQueue::open(Context *completion)
{
  dout(4) << "opening" << dendl;

  Mutex::Locker l(lock);

  if (completion)
    waiting_for_recovery.push_back(completion);

  journaler.recover(new FunctionContext([this](int r){
    if (r == -ENOENT) {
      dout(1) << "Purge Queue not found, assuming this is an upgrade and "
                 "creating it." << dendl;
      create(NULL);
    } else if (r == 0) {
      Mutex::Locker l(lock);
      dout(4) << "open complete" << dendl;

      // Journaler only guarantees entries before head write_pos have been
      // fully flushed. Before appending new entries, we need to find and
      // drop any partially written entry.
      if (journaler.last_committed.write_pos < journaler.get_write_pos()) {
        dout(4) << "recovering write_pos" << dendl;
        journaler.set_read_pos(journaler.last_committed.write_pos);
        _recover();
        return;
      }

      journaler.set_writeable();
      recovered = true;
      finish_contexts(g_ceph_context, waiting_for_recovery);
    } else {
      derr << "Error " << r << " loading Journaler" << dendl;
      on_error->complete(r);
    }
  }));
}
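
// Recovery flow summary for the states handled above:
//  * -ENOENT: no purge queue journal exists yet -- treated as a legacy
//    filesystem being upgraded, so an empty queue is created.
//  * head write_pos behind the real tail: scan forward from the last
//    committed write_pos via _recover() to find the true end of the
//    journal (dropping any partially written final entry) before
//    allowing writes.
//  * clean head: mark writeable and complete any wait_for_recovery()
//    waiters immediately.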

void PurgeQueue::wait_for_recovery(Context* c)
{
  Mutex::Locker l(lock);
  if (recovered)
    c->complete(0);
  else
    waiting_for_recovery.push_back(c);
}

void PurgeQueue::_recover()
{
  assert(lock.is_locked_by_me());

  // Journaler::is_readable() adjusts write_pos if a partial entry is encountered
  while (1) {
    if (!journaler.is_readable() &&
        !journaler.get_error() &&
        journaler.get_read_pos() < journaler.get_write_pos()) {
      journaler.wait_for_readable(new FunctionContext([this](int r) {
        Mutex::Locker l(lock);
        _recover();
      }));
      return;
    }

    if (journaler.get_error()) {
      int r = journaler.get_error();
      derr << "Error " << r << " recovering write_pos" << dendl;
      on_error->complete(r);
      return;
    }

    if (journaler.get_read_pos() == journaler.get_write_pos()) {
      dout(4) << "write_pos recovered" << dendl;
      // restore original read_pos
      journaler.set_read_pos(journaler.last_committed.expire_pos);
      journaler.set_writeable();
      recovered = true;
      finish_contexts(g_ceph_context, waiting_for_recovery);
      return;
    }

    bufferlist bl;
    bool readable = journaler.try_read_entry(bl);
    assert(readable);  // we checked earlier
  }
}

void PurgeQueue::create(Context *fin)
{
  dout(4) << "creating" << dendl;
  Mutex::Locker l(lock);

  if (fin)
    waiting_for_recovery.push_back(fin);

  file_layout_t layout = file_layout_t::get_default();
  layout.pool_id = metadata_pool;
  journaler.set_writeable();
  journaler.create(&layout, JOURNAL_FORMAT_RESILIENT);
  journaler.write_head(new FunctionContext([this](int r) {
    Mutex::Locker l(lock);
    recovered = true;
    finish_contexts(g_ceph_context, waiting_for_recovery);
  }));
}
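
// Note that create() marks the journaler writeable straight away, but
// waiters are only released (recovered = true) once the initial
// write_head() completes, so nothing can use the queue before its header
// exists on disk.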

/**
 * The `completion` context will always be called back via a Finisher
 */
void PurgeQueue::push(const PurgeItem &pi, Context *completion)
{
  dout(4) << "pushing inode 0x" << std::hex << pi.ino << std::dec << dendl;
  Mutex::Locker l(lock);

  // Callers should have waited for open() before using us
  assert(!journaler.is_readonly());

  bufferlist bl;

  ::encode(pi, bl);
  journaler.append_entry(bl);
  journaler.wait_for_flush(completion);

  // Maybe go ahead and do something with it right away
  bool could_consume = _consume();
  if (!could_consume) {
    // Usually it is not necessary to explicitly flush here, because the
    // reader will get flushes generated inside Journaler::is_readable.
    // However, if we remain in a can_consume()==false state for a long
    // period then we should flush in order to allow MDCache to drop its
    // strays rather than having them wait for the PurgeQueue to progress.
    if (!delayed_flush) {
      delayed_flush = new FunctionContext([this](int r){
        delayed_flush = nullptr;
        journaler.flush();
      });

      timer.add_event_after(
        g_conf->mds_purge_queue_busy_flush_period,
        delayed_flush);
    }
  }
}
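
// A minimal caller-side sketch (hypothetical -- in the real tree the
// caller is the StrayManager purging logic, not shown here): build a
// PurgeItem for an unlinked file and hand it off, with a completion that
// fires once the entry is durable in the queue journal.
//
//   PurgeItem pi;
//   pi.action = PurgeItem::PURGE_FILE;
//   pi.ino = in->ino();               // inode being purged (assumed accessor)
//   pi.size = in->inode.size;
//   pi.layout = in->inode.layout;
//   pi.snapc = realm_snap_context;    // placeholder name
//   purge_queue.push(pi, new FunctionContext([](int r) {
//     // entry is flushed to the journal; safe to forget the stray dentry
//   }));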

uint32_t PurgeQueue::_calculate_ops(const PurgeItem &item) const
{
  uint32_t ops_required = 0;
  if (item.action == PurgeItem::PURGE_DIR) {
    // Directory, count dirfrags to be deleted
    std::list<frag_t> ls;
    if (!item.fragtree.is_leaf(frag_t())) {
      item.fragtree.get_leaves(ls);
    }
    // One for the root, plus any leaves
    ops_required = 1 + ls.size();
  } else {
    // File, work out concurrent Filer::purge deletes
    const uint64_t num = (item.size > 0) ?
      Striper::get_num_objects(item.layout, item.size) : 1;

    ops_required = MIN(num, g_conf->filer_max_purge_ops);

    // Account for removing (or zeroing) backtrace
    ops_required += 1;

    // Account for deletions for old pools
    if (item.action != PurgeItem::TRUNCATE_FILE) {
      ops_required += item.old_pools.size();
    }
  }

  return ops_required;
}
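
// Worked example (illustrative numbers): purging a 100 MiB file striped
// over 4 MiB objects gives Striper::get_num_objects() == 25, clamped to
// filer_max_purge_ops concurrent deletes (e.g. 10), plus 1 op for the
// backtrace object => 11 ops charged against max_purge_ops.  A PURGE_DIR
// item whose fragtree has 4 leaf dirfrags costs 1 (root) + 4 = 5 ops.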

bool PurgeQueue::can_consume()
{
  dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, "
           << in_flight.size() << "/" << g_conf->mds_max_purge_files
           << " files" << dendl;

  if (in_flight.size() == 0 && cct->_conf->mds_max_purge_files > 0) {
    // Always permit consumption if nothing is in flight, so that the ops
    // limit can never be so low as to forbid all progress (unless the
    // administrator has deliberately paused purging by setting the max
    // purge files to zero).
    return true;
  }

  if (ops_in_flight >= max_purge_ops) {
    dout(20) << "Throttling on op limit " << ops_in_flight << "/"
             << max_purge_ops << dendl;
    return false;
  }

  if (in_flight.size() >= cct->_conf->mds_max_purge_files) {
    dout(20) << "Throttling on item limit " << in_flight.size()
             << "/" << cct->_conf->mds_max_purge_files << dendl;
    return false;
  } else {
    return true;
  }
}

bool PurgeQueue::_consume()
{
  assert(lock.is_locked_by_me());

  bool could_consume = false;
  while (can_consume()) {

    if (delayed_flush) {
      // We are now going to read from the journal, so any proactive
      // flush is no longer necessary. This is not functionally necessary
      // but it can avoid generating extra fragmented flush IOs.
      timer.cancel_event(delayed_flush);
      delayed_flush = nullptr;
    }

    if (int r = journaler.get_error()) {
      derr << "Error " << r << " reading from journaler" << dendl;
      on_error->complete(r);
      return could_consume;
    }

    if (!journaler.is_readable()) {
      dout(10) << " not readable right now" << dendl;
      // Because we are the writer and the reader of the journal
      // via the same Journaler instance, we never need to reread_head
      if (!journaler.have_waiter()) {
        journaler.wait_for_readable(new FunctionContext([this](int r) {
          Mutex::Locker l(lock);
          if (r == 0) {
            _consume();
          } else if (r != -EAGAIN) {
            on_error->complete(r);
          }
        }));
      }

      return could_consume;
    }

    could_consume = true;
    // The journaler is readable: consume an entry
    bufferlist bl;
    bool readable = journaler.try_read_entry(bl);
    assert(readable);  // we checked earlier

    dout(20) << " decoding entry" << dendl;
    PurgeItem item;
    bufferlist::iterator q = bl.begin();
    try {
      ::decode(item, q);
    } catch (const buffer::error &err) {
      derr << "Decode error at read_pos=0x" << std::hex
           << journaler.get_read_pos() << dendl;
      on_error->complete(0);
      // Don't fall through and execute a partially decoded item
      return could_consume;
    }
    dout(20) << " executing item (0x" << std::hex << item.ino
             << std::dec << ")" << dendl;
    _execute_item(item, journaler.get_read_pos());
  }

  dout(10) << " cannot consume right now" << dendl;

  return could_consume;
}
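
// Note the have_waiter() guard above: _consume() may run many times while
// the journal is unreadable (e.g. on every push), and only one
// wait_for_readable() callback should ever be registered at a time.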

void PurgeQueue::_execute_item(
    const PurgeItem &item,
    uint64_t expire_to)
{
  assert(lock.is_locked_by_me());

  in_flight[expire_to] = item;
  logger->set(l_pq_executing, in_flight.size());
  ops_in_flight += _calculate_ops(item);
  logger->set(l_pq_executing_ops, ops_in_flight);

  SnapContext nullsnapc;

  C_GatherBuilder gather(cct);
  if (item.action == PurgeItem::PURGE_FILE) {
    if (item.size > 0) {
      uint64_t num = Striper::get_num_objects(item.layout, item.size);
      dout(10) << " 0~" << item.size << " objects 0~" << num
               << " snapc " << item.snapc << " on " << item.ino << dendl;
      filer.purge_range(item.ino, &item.layout, item.snapc,
                        0, num, ceph::real_clock::now(), 0,
                        gather.new_sub());
    }

    // remove the backtrace object if it was not purged
    object_t oid = CInode::get_object_name(item.ino, frag_t(), "");
    if (!gather.has_subs() || !item.layout.pool_ns.empty()) {
      object_locator_t oloc(item.layout.pool_id);
      dout(10) << " remove backtrace object " << oid
               << " pool " << oloc.pool << " snapc " << item.snapc << dendl;
      objecter->remove(oid, oloc, item.snapc,
                       ceph::real_clock::now(), 0,
                       gather.new_sub());
    }

    // remove old backtrace objects
    for (const auto &p : item.old_pools) {
      object_locator_t oloc(p);
      dout(10) << " remove backtrace object " << oid
               << " old pool " << p << " snapc " << item.snapc << dendl;
      objecter->remove(oid, oloc, item.snapc,
                       ceph::real_clock::now(), 0,
                       gather.new_sub());
    }
  } else if (item.action == PurgeItem::PURGE_DIR) {
    object_locator_t oloc(metadata_pool);
    std::list<frag_t> frags;
    if (!item.fragtree.is_leaf(frag_t()))
      item.fragtree.get_leaves(frags);
    frags.push_back(frag_t());
    for (const auto &frag : frags) {
      object_t oid = CInode::get_object_name(item.ino, frag, "");
      dout(10) << " remove dirfrag " << oid << dendl;
      objecter->remove(oid, oloc, nullsnapc,
                       ceph::real_clock::now(),
                       0, gather.new_sub());
    }
  } else if (item.action == PurgeItem::TRUNCATE_FILE) {
    const uint64_t num = Striper::get_num_objects(item.layout, item.size);
    dout(10) << " 0~" << item.size << " objects 0~" << num
             << " snapc " << item.snapc << " on " << item.ino << dendl;

    // keep backtrace object
    if (num > 1) {
      filer.purge_range(item.ino, &item.layout, item.snapc,
                        1, num - 1, ceph::real_clock::now(),
                        0, gather.new_sub());
    }
    filer.zero(item.ino, &item.layout, item.snapc,
               0, item.layout.object_size,
               ceph::real_clock::now(),
               0, true, gather.new_sub());
  } else {
    derr << "Invalid item (action=" << item.action << ") in purge queue, "
            "dropping it" << dendl;
    // Undo the accounting done above before dropping the item, so that
    // ops_in_flight does not leak
    ops_in_flight -= _calculate_ops(item);
    logger->set(l_pq_executing_ops, ops_in_flight);
    in_flight.erase(expire_to);
    logger->set(l_pq_executing, in_flight.size());
    return;
  }
  assert(gather.has_subs());

  gather.set_finisher(new C_OnFinisher(
      new FunctionContext([this, expire_to](int r){
        Mutex::Locker l(lock);
        _execute_item_complete(expire_to);

        _consume();

        // Have we gone idle? If so, do an extra write_head now instead of
        // waiting for the next flush after journaler_write_head_interval.
        // Also do this periodically even if not idle, so that the persisted
        // expire_pos doesn't fall too far behind our progress when consuming
        // a very long queue.
        if (in_flight.empty() || journaler.write_head_needed()) {
          journaler.write_head(new FunctionContext([this](int r){
            journaler.trim();
          }));
        }
      }), &finisher));

  gather.activate();
}
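
// The C_GatherBuilder above fans out one RADOS op per sub-context and
// fires its finisher exactly once, after the last sub completes; routing
// that finisher through C_OnFinisher ensures _execute_item_complete()
// runs on our own Finisher thread (where taking `lock` is safe) rather
// than in the Objecter's callback context.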

void PurgeQueue::_execute_item_complete(
    uint64_t expire_to)
{
  assert(lock.is_locked_by_me());
  dout(10) << "complete at 0x" << std::hex << expire_to << std::dec << dendl;
  assert(in_flight.count(expire_to) == 1);

  auto iter = in_flight.find(expire_to);
  assert(iter != in_flight.end());
  if (iter == in_flight.begin()) {
    // This was the lowest journal position in flight, so we can now
    // safely expire the journal up to here.
    dout(10) << "expiring to 0x" << std::hex << expire_to << std::dec << dendl;
    journaler.set_expire_pos(expire_to);
  } else {
    // This is completely fine, we're not supposed to purge files in
    // order when doing them in parallel.
    dout(10) << "non-sequential completion, not expiring anything" << dendl;
  }

  ops_in_flight -= _calculate_ops(iter->second);
  logger->set(l_pq_executing_ops, ops_in_flight);

  dout(10) << "completed item for ino 0x" << std::hex << iter->second.ino
           << std::dec << dendl;

  in_flight.erase(iter);
  logger->set(l_pq_executing, in_flight.size());
  dout(10) << "in_flight.size() now " << in_flight.size() << dendl;

  logger->inc(l_pq_executed);
}

void PurgeQueue::update_op_limit(const MDSMap &mds_map)
{
  Mutex::Locker l(lock);

  uint64_t pg_count = 0;
  objecter->with_osdmap([&](const OSDMap& o) {
    // Number of PGs across all data pools
    const std::vector<int64_t> &data_pools = mds_map.get_data_pools();
    for (const auto dp : data_pools) {
      if (o.get_pg_pool(dp) == NULL) {
        // It is possible that we have an older OSDMap than MDSMap,
        // because we don't start watching every OSDMap until after
        // MDSRank is initialized
        dout(4) << " data pool " << dp << " not found in OSDMap" << dendl;
        continue;
      }
      pg_count += o.get_pg_num(dp);
    }
  });

  // Work out a limit based on n_pgs / n_mdss, multiplied by the user's
  // preference for how many ops per PG
  max_purge_ops = uint64_t(((double)pg_count / (double)mds_map.get_max_mds()) *
                           cct->_conf->mds_max_purge_ops_per_pg);

  // The user may also specify a hard limit; apply it if so.
  if (cct->_conf->mds_max_purge_ops) {
    max_purge_ops = MIN(max_purge_ops, cct->_conf->mds_max_purge_ops);
  }
}
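
// Worked example (illustrative numbers): with 512 PGs across the data
// pools, max_mds = 2 and mds_max_purge_ops_per_pg = 0.5, each rank gets
// max_purge_ops = (512 / 2) * 0.5 = 128, optionally clamped further by
// mds_max_purge_ops if the administrator set a hard cap.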

void PurgeQueue::handle_conf_change(const struct md_config_t *conf,
                                    const std::set <std::string> &changed,
                                    const MDSMap &mds_map)
{
  if (changed.count("mds_max_purge_ops")
      || changed.count("mds_max_purge_ops_per_pg")) {
    update_op_limit(mds_map);
  } else if (changed.count("mds_max_purge_files")) {
    Mutex::Locker l(lock);

    if (in_flight.empty()) {
      // We might have gone from zero to a finite limit, so we
      // might need to kick off consume.
      dout(4) << "maybe start work again (max_purge_files="
              << conf->mds_max_purge_files << ")" << dendl;
      finisher.queue(new FunctionContext([this](int r){
        Mutex::Locker l(lock);
        _consume();
      }));
    }
  }
}

bool PurgeQueue::drain(
    uint64_t *progress,
    uint64_t *progress_total,
    size_t *in_flight_count)
{
  assert(progress != nullptr);
  assert(progress_total != nullptr);
  assert(in_flight_count != nullptr);

  const bool done = in_flight.empty() && (
      journaler.get_read_pos() == journaler.get_write_pos());
  if (done) {
    return true;
  }

  const uint64_t bytes_remaining = journaler.get_write_pos()
                                   - journaler.get_read_pos();

  if (!draining) {
    // Start of draining: remember how much there was outstanding at
    // this point so that we can give a progress percentage later
    draining = true;

    // Lift the op throttle as this daemon now has nothing to do but
    // drain the purge queue, so do it as fast as we can.
    max_purge_ops = 0xffff;
  }

  drain_initial = max(bytes_remaining, drain_initial);

  *progress = drain_initial - bytes_remaining;
  *progress_total = drain_initial;
  *in_flight_count = in_flight.size();

  return false;
}