// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "common/debug.h"
#include "mds/mdstypes.h"
#include "mds/CInode.h"
#include "mds/MDCache.h"

#include "PurgeQueue.h"


#define dout_context cct
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix _prefix(_dout, rank) << __func__ << ": "
static ostream& _prefix(std::ostream *_dout, mds_rank_t rank) {
  return *_dout << "mds." << rank << ".purge_queue ";
}

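// PurgeItem serialization: this defines the on-journal entry format
// (struct version 1, see ENCODE_START/DECODE_START).  The layout member
// is encoded with the CEPH_FEATURE_FS_FILE_LAYOUT_V2 feature bit set.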
void PurgeItem::encode(bufferlist &bl) const
{
  ENCODE_START(1, 1, bl);
  ::encode((uint8_t)action, bl);
  ::encode(ino, bl);
  ::encode(size, bl);
  ::encode(layout, bl, CEPH_FEATURE_FS_FILE_LAYOUT_V2);
  ::encode(old_pools, bl);
  ::encode(snapc, bl);
  ::encode(fragtree, bl);
  ENCODE_FINISH(bl);
}

void PurgeItem::decode(bufferlist::iterator &p)
{
  DECODE_START(1, p);
  ::decode((uint8_t&)action, p);
  ::decode(ino, p);
  ::decode(size, p);
  ::decode(layout, p);
  ::decode(old_pools, p);
  ::decode(snapc, p);
  ::decode(fragtree, p);
  DECODE_FINISH(p);
}

// TODO: if Objecter has any slow requests, take that as a hint and
// slow down our rate of purging (keep accepting pushes though)
PurgeQueue::PurgeQueue(
      CephContext *cct_,
      mds_rank_t rank_,
      const int64_t metadata_pool_,
      Objecter *objecter_,
      Context *on_error_)
  :
    cct(cct_),
    rank(rank_),
    lock("PurgeQueue"),
    metadata_pool(metadata_pool_),
    finisher(cct, "PurgeQueue", "PQ_Finisher"),
    timer(cct, lock),
    filer(objecter_, &finisher),
    objecter(objecter_),
    journaler("pq", MDS_INO_PURGE_QUEUE + rank, metadata_pool,
              CEPH_FS_ONDISK_MAGIC, objecter_, nullptr, 0,
              &finisher),
    on_error(on_error_),
    ops_in_flight(0),
    max_purge_ops(0),
    drain_initial(0),
    draining(false),
    delayed_flush(nullptr)
{
  assert(cct != nullptr);
  assert(on_error != nullptr);
  assert(objecter != nullptr);
  journaler.set_write_error_handler(on_error);
}

PurgeQueue::~PurgeQueue()
{
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
}

void PurgeQueue::create_logger()
{
  PerfCountersBuilder pcb(g_ceph_context,
          "purge_queue", l_pq_first, l_pq_last);
  pcb.add_u64(l_pq_executing_ops, "pq_executing_ops", "Purge queue ops in flight");
  pcb.add_u64(l_pq_executing, "pq_executing", "Purge queue tasks in flight");
  pcb.add_u64_counter(l_pq_executed, "pq_executed", "Purge queue tasks executed", "purg");

  logger.reset(pcb.create_perf_counters());
  g_ceph_context->get_perfcounters_collection()->add(logger.get());
}

void PurgeQueue::init()
{
  Mutex::Locker l(lock);

  assert(logger != nullptr);

  finisher.start();
  timer.init();
}

void PurgeQueue::shutdown()
{
  Mutex::Locker l(lock);

  journaler.shutdown();
  timer.shutdown();
  finisher.stop();
}

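// Open the purge queue's backing Journaler.  On -ENOENT the queue is
// assumed to be absent because this filesystem predates the purge
// queue, so an empty one is created in the metadata pool.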
void PurgeQueue::open(Context *completion)
{
  dout(4) << "opening" << dendl;

  Mutex::Locker l(lock);

  journaler.recover(new FunctionContext([this, completion](int r){
    if (r == -ENOENT) {
      dout(1) << "Purge Queue not found, assuming this is an upgrade and "
                 "creating it." << dendl;
      create(completion);
    } else if (r == 0) {
      Mutex::Locker l(lock);
      dout(4) << "open complete" << dendl;

      // Journaler only guarantees that entries before the head's write_pos
      // have been fully flushed.  Before appending new entries, we need to
      // find and drop any partially written entry.
      if (journaler.last_committed.write_pos < journaler.get_write_pos()) {
        dout(4) << "recovering write_pos" << dendl;
        journaler.set_read_pos(journaler.last_committed.write_pos);
        _recover(completion);
        return;
      }

      journaler.set_writeable();
      completion->complete(0);
    } else {
      derr << "Error " << r << " loading Journaler" << dendl;
      on_error->complete(r);
    }
  }));
}

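// Scan forward from the last committed write_pos to find the true end
// of the journal, dropping any trailing partially-written entry, then
// restore read_pos and mark the journal writeable.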
void PurgeQueue::_recover(Context *completion)
{
  assert(lock.is_locked_by_me());

  // Journaler::is_readable() adjusts write_pos if a partial entry is
  // encountered
  while (1) {
    if (!journaler.is_readable() &&
        !journaler.get_error() &&
        journaler.get_read_pos() < journaler.get_write_pos()) {
      journaler.wait_for_readable(new FunctionContext([this, completion](int r) {
        Mutex::Locker l(lock);
        _recover(completion);
      }));
      return;
    }

    if (journaler.get_error()) {
      int r = journaler.get_error();
      derr << "Error " << r << " recovering write_pos" << dendl;
      on_error->complete(r);
      return;
    }

    if (journaler.get_read_pos() == journaler.get_write_pos()) {
      dout(4) << "write_pos recovered" << dendl;
      // restore original read_pos
      journaler.set_read_pos(journaler.last_committed.expire_pos);
      journaler.set_writeable();
      completion->complete(0);
      return;
    }

    bufferlist bl;
    bool readable = journaler.try_read_entry(bl);
    assert(readable);  // we checked earlier
  }
}

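// Instantiate a fresh, empty purge queue journal in the metadata pool
// and persist its header; `fin` completes once the head is written.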
void PurgeQueue::create(Context *fin)
{
  dout(4) << "creating" << dendl;
  Mutex::Locker l(lock);

  file_layout_t layout = file_layout_t::get_default();
  layout.pool_id = metadata_pool;
  journaler.set_writeable();
  journaler.create(&layout, JOURNAL_FORMAT_RESILIENT);
  journaler.write_head(fin);
}

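/*
 * Illustrative sketch only (not code from this file): a caller such as
 * StrayManager enqueues a deletion roughly along these lines, with the
 * exact field names depending on the caller's context:
 *
 *   PurgeItem item;
 *   item.action = PurgeItem::PURGE_FILE;
 *   item.ino = in->inode.ino;
 *   item.size = in->inode.size;
 *   item.layout = in->inode.layout;
 *   item.snapc = realm->get_snap_context();
 *   purge_queue.push(item, new FunctionContext([](int r){
 *     // entry is now durable in the queue's journal
 *   }));
 */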
/**
 * The `completion` context will always be called back via a Finisher
 */
void PurgeQueue::push(const PurgeItem &pi, Context *completion)
{
  dout(4) << "pushing inode 0x" << std::hex << pi.ino << std::dec << dendl;
  Mutex::Locker l(lock);

  // Callers should have waited for open() before using us
  assert(!journaler.is_readonly());

  bufferlist bl;

  ::encode(pi, bl);
  journaler.append_entry(bl);
  journaler.wait_for_flush(completion);

  // Maybe go ahead and do something with it right away
  bool could_consume = _consume();
  if (!could_consume) {
    // Usually it is not necessary to explicitly flush here, because the
    // reader will get flushes generated inside Journaler::is_readable.
    // However, if we remain in a can_consume()==false state for a long
    // period then we should flush in order to allow MDCache to drop its
    // strays rather than having them wait for the PurgeQueue to progress.
    if (!delayed_flush) {
      delayed_flush = new FunctionContext([this](int r){
            delayed_flush = nullptr;
            journaler.flush();
          });

      timer.add_event_after(
          g_conf->mds_purge_queue_busy_flush_period,
          delayed_flush);
    }
  }
}

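// Estimate how many RADOS ops executing this item will keep in flight,
// for accounting against max_purge_ops.  For illustration: purging a
// 40 MiB file striped over 4 MiB objects needs min(10, filer_max_purge_ops)
// concurrent deletes, plus one op for the backtrace object, plus one
// per old pool that may hold a stale backtrace.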
uint32_t PurgeQueue::_calculate_ops(const PurgeItem &item) const
{
  uint32_t ops_required = 0;
  if (item.action == PurgeItem::PURGE_DIR) {
    // Directory, count dirfrags to be deleted
    std::list<frag_t> ls;
    if (!item.fragtree.is_leaf(frag_t())) {
      item.fragtree.get_leaves(ls);
    }
    // One for the root, plus any leaves
    ops_required = 1 + ls.size();
  } else {
    // File, work out concurrent Filer::purge deletes
    const uint64_t num = (item.size > 0) ?
      Striper::get_num_objects(item.layout, item.size) : 1;

    ops_required = MIN(num, g_conf->filer_max_purge_ops);

    // Account for removing (or zeroing) backtrace
    ops_required += 1;

    // Account for deletions for old pools
    if (item.action != PurgeItem::TRUNCATE_FILE) {
      ops_required += item.old_pools.size();
    }
  }

  return ops_required;
}

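// May we take another item off the journal?  Enforces two throttles:
// concurrent RADOS ops (max_purge_ops) and concurrent items
// (mds_max_purge_files).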
bool PurgeQueue::can_consume()
{
  dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, "
           << in_flight.size() << "/" << g_conf->mds_max_purge_files
           << " files" << dendl;

  if (in_flight.size() == 0 && cct->_conf->mds_max_purge_files > 0) {
    // Always permit consumption if nothing is in flight, so that the ops
    // limit can never be so low as to forbid all progress (unless the
    // administrator has deliberately paused purging by setting max
    // purge files to zero).
    return true;
  }

  if (ops_in_flight >= max_purge_ops) {
    dout(20) << "Throttling on op limit " << ops_in_flight << "/"
             << max_purge_ops << dendl;
    return false;
  }

  if (in_flight.size() >= cct->_conf->mds_max_purge_files) {
    dout(20) << "Throttling on item limit " << in_flight.size()
             << "/" << cct->_conf->mds_max_purge_files << dendl;
    return false;
  } else {
    return true;
  }
}

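// Pull entries off the journal and execute them while can_consume()
// permits.  Returns whether anything at all could be consumed this
// call (used by push() to decide whether a delayed flush is needed).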
bool PurgeQueue::_consume()
{
  assert(lock.is_locked_by_me());

  bool could_consume = false;
  while (can_consume()) {
    could_consume = true;

    if (delayed_flush) {
      // We are now going to read from the journal, so any proactive
      // flush is no longer necessary.  This is not functionally necessary
      // but it can avoid generating extra fragmented flush IOs.
      timer.cancel_event(delayed_flush);
      delayed_flush = nullptr;
    }

    if (!journaler.is_readable()) {
      dout(10) << " not readable right now" << dendl;
      // Because we are both the writer and the reader of the journal
      // via the same Journaler instance, we never need to reread_head
      if (!journaler.have_waiter()) {
        journaler.wait_for_readable(new FunctionContext([this](int r) {
          Mutex::Locker l(lock);
          if (r == 0) {
            _consume();
          }
        }));
      }

      return could_consume;
    }

    // The journaler is readable: consume an entry
    bufferlist bl;
    bool readable = journaler.try_read_entry(bl);
    assert(readable);  // we checked earlier

    dout(20) << " decoding entry" << dendl;
    PurgeItem item;
    bufferlist::iterator q = bl.begin();
    try {
      ::decode(item, q);
    } catch (const buffer::error &err) {
      derr << "Decode error at read_pos=0x" << std::hex
           << journaler.get_read_pos() << dendl;
      on_error->complete(0);
      // Don't try to execute a garbled item
      return could_consume;
    }
    dout(20) << " executing item (0x" << std::hex << item.ino
             << std::dec << ")" << dendl;
    _execute_item(item, journaler.get_read_pos());
  }

  dout(10) << " cannot consume right now" << dendl;

  return could_consume;
}

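// Issue the RADOS operations for one dequeued item: object deletes for
// a file purge, dirfrag deletes for a directory purge, or a
// purge_range plus zero for a truncation.  All ops are gathered so the
// item completes (and the journal can expire past it) only once every
// op has finished.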
void PurgeQueue::_execute_item(
    const PurgeItem &item,
    uint64_t expire_to)
{
  assert(lock.is_locked_by_me());

  in_flight[expire_to] = item;
  logger->set(l_pq_executing, in_flight.size());
  ops_in_flight += _calculate_ops(item);
  logger->set(l_pq_executing_ops, ops_in_flight);

  SnapContext nullsnapc;

  C_GatherBuilder gather(cct);
  if (item.action == PurgeItem::PURGE_FILE) {
    if (item.size > 0) {
      uint64_t num = Striper::get_num_objects(item.layout, item.size);
      dout(10) << " 0~" << item.size << " objects 0~" << num
               << " snapc " << item.snapc << " on " << item.ino << dendl;
      filer.purge_range(item.ino, &item.layout, item.snapc,
                        0, num, ceph::real_clock::now(), 0,
                        gather.new_sub());
    }

    // remove the backtrace object if it was not purged
    object_t oid = CInode::get_object_name(item.ino, frag_t(), "");
    if (!gather.has_subs() || !item.layout.pool_ns.empty()) {
      object_locator_t oloc(item.layout.pool_id);
      dout(10) << " remove backtrace object " << oid
               << " pool " << oloc.pool << " snapc " << item.snapc << dendl;
      objecter->remove(oid, oloc, item.snapc,
                       ceph::real_clock::now(), 0,
                       gather.new_sub());
    }

    // remove old backtrace objects
    for (const auto &p : item.old_pools) {
      object_locator_t oloc(p);
      dout(10) << " remove backtrace object " << oid
               << " old pool " << p << " snapc " << item.snapc << dendl;
      objecter->remove(oid, oloc, item.snapc,
                       ceph::real_clock::now(), 0,
                       gather.new_sub());
    }
  } else if (item.action == PurgeItem::PURGE_DIR) {
    object_locator_t oloc(metadata_pool);
    std::list<frag_t> frags;
    if (!item.fragtree.is_leaf(frag_t()))
      item.fragtree.get_leaves(frags);
    frags.push_back(frag_t());
    for (const auto &frag : frags) {
      object_t oid = CInode::get_object_name(item.ino, frag, "");
      dout(10) << " remove dirfrag " << oid << dendl;
      objecter->remove(oid, oloc, nullsnapc,
                       ceph::real_clock::now(),
                       0, gather.new_sub());
    }
  } else if (item.action == PurgeItem::TRUNCATE_FILE) {
    const uint64_t num = Striper::get_num_objects(item.layout, item.size);
    dout(10) << " 0~" << item.size << " objects 0~" << num
             << " snapc " << item.snapc << " on " << item.ino << dendl;

    // keep backtrace object
    if (num > 1) {
      filer.purge_range(item.ino, &item.layout, item.snapc,
                        1, num - 1, ceph::real_clock::now(),
                        0, gather.new_sub());
    }
    filer.zero(item.ino, &item.layout, item.snapc,
               0, item.layout.object_size,
               ceph::real_clock::now(),
               0, true, gather.new_sub());
  } else {
    derr << "Invalid item (action=" << item.action << ") in purge queue, "
            "dropping it" << dendl;
    // Undo the accounting done above before dropping the item
    ops_in_flight -= _calculate_ops(item);
    logger->set(l_pq_executing_ops, ops_in_flight);
    in_flight.erase(expire_to);
    logger->set(l_pq_executing, in_flight.size());
    return;
  }
  assert(gather.has_subs());

  gather.set_finisher(new FunctionContext([this, expire_to](int r){
    if (lock.is_locked_by_me()) {
      // Fast completion: the Objecter ops completed before we hit
      // gather.activate() and we're being called inline.  We are still
      // inside _consume, so there is no need to call back into it.
      _execute_item_complete(expire_to);
    } else {
      // Normal completion: we're being called back from outside
      // PurgeQueue::lock by the Objecter.  Take the lock, and call back
      // into _consume to find more work.
      Mutex::Locker l(lock);
      _execute_item_complete(expire_to);

      _consume();
    }

    // Have we gone idle?  If so, do an extra write_head now instead of
    // waiting for the next flush after journaler_write_head_interval.
    // Also do this periodically even if not idle, so that the persisted
    // expire_pos doesn't fall too far behind our progress when consuming
    // a very long queue.
    if (in_flight.empty() || journaler.write_head_needed()) {
      journaler.write_head(new FunctionContext([this](int r){
        journaler.trim();
      }));
    }
  }));
  gather.activate();
}

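// Called under the lock once all RADOS ops for an item have finished:
// advance expire_pos if this was the lowest position in flight, and
// release the item's share of the op and file throttles.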
void PurgeQueue::_execute_item_complete(
    uint64_t expire_to)
{
  assert(lock.is_locked_by_me());
  dout(10) << "complete at 0x" << std::hex << expire_to << std::dec << dendl;
  assert(in_flight.count(expire_to) == 1);

  auto iter = in_flight.find(expire_to);
  assert(iter != in_flight.end());
  if (iter == in_flight.begin()) {
    // This was the lowest journal position in flight, so we can now
    // safely expire the journal up to here.
    dout(10) << "expiring to 0x" << std::hex << expire_to << std::dec << dendl;
    journaler.set_expire_pos(expire_to);
  } else {
    // This is completely fine; we're not expected to complete items in
    // order when purging them in parallel.
    dout(10) << "non-sequential completion, not expiring anything" << dendl;
  }

  ops_in_flight -= _calculate_ops(iter->second);
  logger->set(l_pq_executing_ops, ops_in_flight);

  dout(10) << "completed item for ino 0x" << std::hex << iter->second.ino
           << std::dec << dendl;

  in_flight.erase(iter);
  logger->set(l_pq_executing, in_flight.size());
  dout(10) << "in_flight.size() now " << in_flight.size() << dendl;

  logger->inc(l_pq_executed);
}

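// Recompute max_purge_ops from the cluster shape.  For illustration:
// with 256 PGs across the data pools, max_mds=2 and
// mds_max_purge_ops_per_pg=0.5, each rank gets (256 / 2) * 0.5 = 64
// concurrent purge ops, optionally clamped by mds_max_purge_ops.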
void PurgeQueue::update_op_limit(const MDSMap &mds_map)
{
  Mutex::Locker l(lock);

  uint64_t pg_count = 0;
  objecter->with_osdmap([&](const OSDMap& o) {
    // Number of PGs across all data pools
    const std::set<int64_t> &data_pools = mds_map.get_data_pools();
    for (const auto dp : data_pools) {
      if (o.get_pg_pool(dp) == NULL) {
        // It is possible that we have an older OSDMap than the MDSMap,
        // because we don't start watching every OSDMap until after
        // MDSRank is initialized
        dout(4) << " data pool " << dp << " not found in OSDMap" << dendl;
        continue;
      }
      pg_count += o.get_pg_num(dp);
    }
  });

  // Work out a limit based on n_pgs / n_mdss, multiplied by the user's
  // preference for how many ops per PG
  max_purge_ops = uint64_t(((double)pg_count / (double)mds_map.get_max_mds()) *
                           cct->_conf->mds_max_purge_ops_per_pg);

  // The user may also specify a hard limit; apply it if so.
  if (cct->_conf->mds_max_purge_ops) {
    max_purge_ops = MIN(max_purge_ops, cct->_conf->mds_max_purge_ops);
  }
}

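// React to runtime config changes: recompute the op limit, or kick
// _consume if the file limit changed while we were idle (it may have
// gone from zero back to a finite value).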
void PurgeQueue::handle_conf_change(const struct md_config_t *conf,
                                    const std::set <std::string> &changed,
                                    const MDSMap &mds_map)
{
  if (changed.count("mds_max_purge_ops")
      || changed.count("mds_max_purge_ops_per_pg")) {
    update_op_limit(mds_map);
  } else if (changed.count("mds_max_purge_files")) {
    Mutex::Locker l(lock);

    if (in_flight.empty()) {
      // We might have gone from zero to a finite limit, so we
      // might need to kick off consume.
      dout(4) << "maybe start work again (max_purge_files="
              << conf->mds_max_purge_files << ")" << dendl;
      finisher.queue(new FunctionContext([this](int r){
        Mutex::Locker l(lock);
        _consume();
      }));
    }
  }
}

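// Called on shutdown to report progress while the queue empties.
// Returns true once everything is purged; otherwise fills in progress
// counters measured in journal bytes (consumed vs. outstanding at the
// start of draining) plus the current in-flight item count.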
bool PurgeQueue::drain(
    uint64_t *progress,
    uint64_t *progress_total,
    size_t *in_flight_count
    )
{
  assert(progress != nullptr);
  assert(progress_total != nullptr);
  assert(in_flight_count != nullptr);

  const bool done = in_flight.empty() && (
      journaler.get_read_pos() == journaler.get_write_pos());
  if (done) {
    return true;
  }

  const uint64_t bytes_remaining = journaler.get_write_pos()
                                   - journaler.get_read_pos();

  if (!draining) {
    // Start of draining: remember how much was outstanding at
    // this point so that we can give a progress percentage later
    draining = true;

    // Lift the op throttle as this daemon now has nothing to do but
    // drain the purge queue, so do it as fast as we can.
    max_purge_ops = 0xffff;
  }

  drain_initial = max(bytes_remaining, drain_initial);

  *progress = drain_initial - bytes_remaining;
  *progress_total = drain_initial;
  *in_flight_count = in_flight.size();

  return false;
}