// NOTE: extraction artifact (git-blame table header) removed during cleanup.
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
14 | ||
15 | #include "common/debug.h" | |
16 | #include "mds/mdstypes.h" | |
17 | #include "mds/CInode.h" | |
18 | #include "mds/MDCache.h" | |
19 | ||
20 | #include "PurgeQueue.h" | |
21 | ||
22 | ||
#define dout_context cct
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix _prefix(_dout, rank) << __func__ << ": "
// Debug-log prefix for every dout()/derr in this file:
// "mds.<rank>.purge_queue <function>: ".
static ostream& _prefix(std::ostream *_dout, mds_rank_t rank) {
  return *_dout << "mds." << rank << ".purge_queue ";
}
30 | ||
// Serialize this PurgeItem (on-wire/on-journal version 1).
// Field order is the wire format and must stay in sync with
// PurgeItem::decode().
void PurgeItem::encode(bufferlist &bl) const
{
  ENCODE_START(1, 1, bl);
  // action is an enum; encoded as a fixed-width u8
  ::encode((uint8_t)action, bl);
  ::encode(ino, bl);
  ::encode(size, bl);
  // layout encoding is feature-gated on FS_FILE_LAYOUT_V2
  ::encode(layout, bl, CEPH_FEATURE_FS_FILE_LAYOUT_V2);
  ::encode(old_pools, bl);
  ::encode(snapc, bl);
  ::encode(fragtree, bl);
  ENCODE_FINISH(bl);
}
43 | ||
// Deserialize a PurgeItem; field order mirrors PurgeItem::encode().
// May throw buffer::error on a truncated/corrupt entry (callers such
// as PurgeQueue::_consume() catch this).
void PurgeItem::decode(bufferlist::iterator &p)
{
  DECODE_START(1, p);
  // read the u8 directly into the enum's storage
  ::decode((uint8_t&)action, p);
  ::decode(ino, p);
  ::decode(size, p);
  ::decode(layout, p);
  ::decode(old_pools, p);
  ::decode(snapc, p);
  ::decode(fragtree, p);
  DECODE_FINISH(p);
}
56 | ||
// TODO: if Objecter has any slow requests, take that as a hint and
// slow down our rate of purging (keep accepting pushes though)
//
// Construct the purge queue for MDS rank `rank_`, journaled in the
// metadata pool.  `on_error_` is invoked on unrecoverable journal
// errors.  cct_, objecter_ and on_error_ must be non-null (asserted).
PurgeQueue::PurgeQueue(
      CephContext *cct_,
      mds_rank_t rank_,
      const int64_t metadata_pool_,
      Objecter *objecter_,
      Context *on_error_)
  :
    cct(cct_),
    rank(rank_),
    lock("PurgeQueue"),
    metadata_pool(metadata_pool_),
    finisher(cct, "PurgeQueue", "PQ_Finisher"),
    timer(cct, lock),
    filer(objecter_, &finisher),
    objecter(objecter_),
    // per-rank journal object: MDS_INO_PURGE_QUEUE + rank
    journaler("pq", MDS_INO_PURGE_QUEUE + rank, metadata_pool,
              CEPH_FS_ONDISK_MAGIC, objecter_, nullptr, 0,
              &finisher),
    on_error(on_error_),
    ops_in_flight(0),
    max_purge_ops(0),
    drain_initial(0),
    draining(false),
    delayed_flush(nullptr),
    recovered(false)
{
  assert(cct != nullptr);
  assert(on_error != nullptr);
  assert(objecter != nullptr);
  journaler.set_write_error_handler(on_error);
}
90 | ||
PurgeQueue::~PurgeQueue()
{
  // Deregister perf counters if create_logger() was ever called;
  // `logger` is a smart pointer so the counters themselves are freed
  // with it.
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
}
97 | ||
// Build and register the "purge_queue" perf counters.  Must be called
// before init() (init() asserts `logger` is set).
void PurgeQueue::create_logger()
{
  PerfCountersBuilder pcb(g_ceph_context, "purge_queue", l_pq_first, l_pq_last);

  pcb.add_u64_counter(l_pq_executed, "pq_executed", "Purge queue tasks executed",
                      "purg", PerfCountersBuilder::PRIO_INTERESTING);

  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  pcb.add_u64(l_pq_executing_ops, "pq_executing_ops", "Purge queue ops in flight");
  pcb.add_u64(l_pq_executing, "pq_executing", "Purge queue tasks in flight");

  logger.reset(pcb.create_perf_counters());
  g_ceph_context->get_perfcounters_collection()->add(logger.get());
}
112 | ||
// Start the worker threads (finisher) and the timer.  create_logger()
// must have been called first.
void PurgeQueue::init()
{
  Mutex::Locker l(lock);

  assert(logger != nullptr);  // create_logger() must run before init()

  finisher.start();
  timer.init();
}
122 | ||
// Kick consumption back into motion: if the journal has unread entries
// and nothing is currently executing, schedule a _consume() pass on the
// finisher.
void PurgeQueue::activate()
{
  Mutex::Locker l(lock);
  if (journaler.get_read_pos() == journaler.get_write_pos())
    return;  // queue is fully consumed; nothing to do

  if (in_flight.empty()) {
    dout(4) << "start work (by drain)" << dendl;
    finisher.queue(new FunctionContext([this](int r) {
      Mutex::Locker l(lock);
      _consume();
    }));
  }
}
137 | ||
// Tear down journal, timer and finisher.  After this the queue must
// not be used again.
void PurgeQueue::shutdown()
{
  Mutex::Locker l(lock);

  journaler.shutdown();
  timer.shutdown();
  finisher.stop();
}
146 | ||
// Recover (or, on first upgrade, create) the purge queue journal.
// `completion` (may be null) is queued on waiting_for_recovery and is
// completed once recovery finishes (possibly via _recover() or
// create()).
void PurgeQueue::open(Context *completion)
{
  dout(4) << "opening" << dendl;

  Mutex::Locker l(lock);

  if (completion)
    waiting_for_recovery.push_back(completion);

  journaler.recover(new FunctionContext([this](int r){
    if (r == -ENOENT) {
      // No journal object: this filesystem predates the purge queue.
      dout(1) << "Purge Queue not found, assuming this is an upgrade and "
                 "creating it." << dendl;
      create(NULL);
    } else if (r == 0) {
      Mutex::Locker l(lock);
      dout(4) << "open complete" << dendl;

      // Journaler only guarantees entries before head write_pos have been
      // fully flushed. Before appending new entries, we need to find and
      // drop any partial written entry.
      if (journaler.last_committed.write_pos < journaler.get_write_pos()) {
        dout(4) << "recovering write_pos" << dendl;
        journaler.set_read_pos(journaler.last_committed.write_pos);
        _recover();  // completes waiting_for_recovery when done
        return;
      }

      journaler.set_writeable();
      recovered = true;
      finish_contexts(g_ceph_context, waiting_for_recovery);
    } else {
      derr << "Error " << r << " loading Journaler" << dendl;
      on_error->complete(r);
    }
  }));
}
184 | ||
3efd9988 FG |
185 | void PurgeQueue::wait_for_recovery(Context* c) |
186 | { | |
187 | Mutex::Locker l(lock); | |
188 | if (recovered) | |
189 | c->complete(0); | |
190 | else | |
191 | waiting_for_recovery.push_back(c); | |
192 | } | |
7c673cae | 193 | |
// Scan forward from last_committed.write_pos to the head, discarding
// any partially-written trailing entry, then restore read_pos and mark
// the queue recovered.  Re-schedules itself asynchronously whenever the
// journal is not yet readable.  Caller must hold `lock`.
void PurgeQueue::_recover()
{
  assert(lock.is_locked_by_me());

  // Journaler::is_readable() adjusts write_pos if partial entry is encountered
  while (1) {
    if (!journaler.is_readable() &&
        !journaler.get_error() &&
        journaler.get_read_pos() < journaler.get_write_pos()) {
      // Not readable yet but more data exists: wait and re-enter.
      journaler.wait_for_readable(new FunctionContext([this](int r) {
        Mutex::Locker l(lock);
        _recover();
      }));
      return;
    }

    if (journaler.get_error()) {
      int r = journaler.get_error();
      derr << "Error " << r << " recovering write_pos" << dendl;
      on_error->complete(r);
      return;
    }

    if (journaler.get_read_pos() == journaler.get_write_pos()) {
      // Reached head: any partial tail entry has been dropped.
      dout(4) << "write_pos recovered" << dendl;
      // restore original read_pos
      journaler.set_read_pos(journaler.last_committed.expire_pos);
      journaler.set_writeable();
      recovered = true;
      finish_contexts(g_ceph_context, waiting_for_recovery);
      return;
    }

    // Consume (and discard) one entry, advancing read_pos.
    bufferlist bl;
    bool readable = journaler.try_read_entry(bl);
    assert(readable);  // we checked earlier
  }
}
232 | ||
// Create a fresh (empty) purge queue journal.  `fin` (may be null) is
// completed once the initial head has been persisted.
void PurgeQueue::create(Context *fin)
{
  dout(4) << "creating" << dendl;
  Mutex::Locker l(lock);

  if (fin)
    waiting_for_recovery.push_back(fin);

  file_layout_t layout = file_layout_t::get_default();
  layout.pool_id = metadata_pool;
  journaler.set_writeable();
  journaler.create(&layout, JOURNAL_FORMAT_RESILIENT);
  // Mark recovered only after the head write is durable.
  journaler.write_head(new FunctionContext([this](int r) {
    Mutex::Locker l(lock);
    recovered = true;
    finish_contexts(g_ceph_context, waiting_for_recovery);
  }));
}
251 | ||
/**
 * Append a purge item to the journal and attempt to start executing it.
 *
 * The `completion` context will always be called back via a Finisher
 * (it fires when the journal append has been flushed, not when the
 * purge itself completes).
 */
void PurgeQueue::push(const PurgeItem &pi, Context *completion)
{
  dout(4) << "pushing inode 0x" << std::hex << pi.ino << std::dec << dendl;
  Mutex::Locker l(lock);

  // Callers should have waited for open() before using us
  assert(!journaler.is_readonly());

  bufferlist bl;

  ::encode(pi, bl);
  journaler.append_entry(bl);
  journaler.wait_for_flush(completion);

  // Maybe go ahead and do something with it right away
  bool could_consume = _consume();
  if (!could_consume) {
    // Usually, it is not necessary to explicitly flush here, because the reader
    // will get flushes generated inside Journaler::is_readable.  However,
    // if we remain in a can_consume()==false state for a long period then
    // we should flush in order to allow MDCache to drop its strays rather
    // than having them wait for purgequeue to progress.
    if (!delayed_flush) {
      // Schedule a one-shot flush; the event clears delayed_flush so a
      // new one can be scheduled after it fires.
      delayed_flush = new FunctionContext([this](int r){
            delayed_flush = nullptr;
            journaler.flush();
          });

      timer.add_event_after(
          g_conf->mds_purge_queue_busy_flush_period,
          delayed_flush);
    }
  }
}
289 | ||
// Estimate how many RADOS operations executing `item` will issue; used
// to charge/release the ops_in_flight throttle.  Must stay consistent
// with the ops actually issued in _execute_item().
uint32_t PurgeQueue::_calculate_ops(const PurgeItem &item) const
{
  uint32_t ops_required = 0;
  if (item.action == PurgeItem::PURGE_DIR) {
    // Directory, count dirfrags to be deleted
    std::list<frag_t> ls;
    if (!item.fragtree.is_leaf(frag_t())) {
      item.fragtree.get_leaves(ls);
    }
    // One for the root, plus any leaves
    ops_required = 1 + ls.size();
  } else {
    // File, work out concurrent Filer::purge deletes
    // (a zero-size file still costs one op)
    const uint64_t num = (item.size > 0) ?
      Striper::get_num_objects(item.layout, item.size) : 1;

    ops_required = MIN(num, g_conf->filer_max_purge_ops);

    // Account for removing (or zeroing) backtrace
    ops_required += 1;

    // Account for deletions for old pools
    if (item.action != PurgeItem::TRUNCATE_FILE) {
      ops_required += item.old_pools.size();
    }
  }

  return ops_required;
}
319 | ||
320 | bool PurgeQueue::can_consume() | |
321 | { | |
322 | dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, " | |
323 | << in_flight.size() << "/" << g_conf->mds_max_purge_files | |
324 | << " files" << dendl; | |
325 | ||
326 | if (in_flight.size() == 0 && cct->_conf->mds_max_purge_files > 0) { | |
327 | // Always permit consumption if nothing is in flight, so that the ops | |
328 | // limit can never be so low as to forbid all progress (unless | |
329 | // administrator has deliberately paused purging by setting max | |
330 | // purge files to zero). | |
331 | return true; | |
332 | } | |
333 | ||
334 | if (ops_in_flight >= max_purge_ops) { | |
335 | dout(20) << "Throttling on op limit " << ops_in_flight << "/" | |
336 | << max_purge_ops << dendl; | |
337 | return false; | |
338 | } | |
339 | ||
340 | if (in_flight.size() >= cct->_conf->mds_max_purge_files) { | |
341 | dout(20) << "Throttling on item limit " << in_flight.size() | |
342 | << "/" << cct->_conf->mds_max_purge_files << dendl; | |
343 | return false; | |
344 | } else { | |
345 | return true; | |
346 | } | |
347 | } | |
348 | ||
349 | bool PurgeQueue::_consume() | |
350 | { | |
351 | assert(lock.is_locked_by_me()); | |
352 | ||
353 | bool could_consume = false; | |
354 | while(can_consume()) { | |
7c673cae FG |
355 | |
356 | if (delayed_flush) { | |
357 | // We are now going to read from the journal, so any proactive | |
358 | // flush is no longer necessary. This is not functionally necessary | |
359 | // but it can avoid generating extra fragmented flush IOs. | |
360 | timer.cancel_event(delayed_flush); | |
361 | delayed_flush = nullptr; | |
362 | } | |
363 | ||
1adf2230 AA |
364 | if (int r = journaler.get_error()) { |
365 | derr << "Error " << r << " recovering write_pos" << dendl; | |
366 | on_error->complete(r); | |
367 | return could_consume; | |
368 | } | |
369 | ||
7c673cae FG |
370 | if (!journaler.is_readable()) { |
371 | dout(10) << " not readable right now" << dendl; | |
372 | // Because we are the writer and the reader of the journal | |
373 | // via the same Journaler instance, we never need to reread_head | |
374 | if (!journaler.have_waiter()) { | |
375 | journaler.wait_for_readable(new FunctionContext([this](int r) { | |
376 | Mutex::Locker l(lock); | |
377 | if (r == 0) { | |
378 | _consume(); | |
1adf2230 AA |
379 | } else if (r != -EAGAIN) { |
380 | on_error->complete(r); | |
7c673cae FG |
381 | } |
382 | })); | |
383 | } | |
384 | ||
385 | return could_consume; | |
386 | } | |
387 | ||
28e407b8 | 388 | could_consume = true; |
7c673cae FG |
389 | // The journaler is readable: consume an entry |
390 | bufferlist bl; | |
391 | bool readable = journaler.try_read_entry(bl); | |
392 | assert(readable); // we checked earlier | |
393 | ||
394 | dout(20) << " decoding entry" << dendl; | |
395 | PurgeItem item; | |
396 | bufferlist::iterator q = bl.begin(); | |
397 | try { | |
398 | ::decode(item, q); | |
399 | } catch (const buffer::error &err) { | |
400 | derr << "Decode error at read_pos=0x" << std::hex | |
401 | << journaler.get_read_pos() << dendl; | |
402 | on_error->complete(0); | |
403 | } | |
404 | dout(20) << " executing item (0x" << std::hex << item.ino | |
405 | << std::dec << ")" << dendl; | |
406 | _execute_item(item, journaler.get_read_pos()); | |
407 | } | |
408 | ||
409 | dout(10) << " cannot consume right now" << dendl; | |
410 | ||
411 | return could_consume; | |
412 | } | |
413 | ||
414 | void PurgeQueue::_execute_item( | |
415 | const PurgeItem &item, | |
416 | uint64_t expire_to) | |
417 | { | |
418 | assert(lock.is_locked_by_me()); | |
419 | ||
420 | in_flight[expire_to] = item; | |
421 | logger->set(l_pq_executing, in_flight.size()); | |
422 | ops_in_flight += _calculate_ops(item); | |
423 | logger->set(l_pq_executing_ops, ops_in_flight); | |
424 | ||
425 | SnapContext nullsnapc; | |
426 | ||
427 | C_GatherBuilder gather(cct); | |
428 | if (item.action == PurgeItem::PURGE_FILE) { | |
429 | if (item.size > 0) { | |
430 | uint64_t num = Striper::get_num_objects(item.layout, item.size); | |
431 | dout(10) << " 0~" << item.size << " objects 0~" << num | |
432 | << " snapc " << item.snapc << " on " << item.ino << dendl; | |
433 | filer.purge_range(item.ino, &item.layout, item.snapc, | |
434 | 0, num, ceph::real_clock::now(), 0, | |
435 | gather.new_sub()); | |
436 | } | |
437 | ||
438 | // remove the backtrace object if it was not purged | |
439 | object_t oid = CInode::get_object_name(item.ino, frag_t(), ""); | |
440 | if (!gather.has_subs() || !item.layout.pool_ns.empty()) { | |
441 | object_locator_t oloc(item.layout.pool_id); | |
442 | dout(10) << " remove backtrace object " << oid | |
443 | << " pool " << oloc.pool << " snapc " << item.snapc << dendl; | |
444 | objecter->remove(oid, oloc, item.snapc, | |
445 | ceph::real_clock::now(), 0, | |
446 | gather.new_sub()); | |
447 | } | |
448 | ||
449 | // remove old backtrace objects | |
450 | for (const auto &p : item.old_pools) { | |
451 | object_locator_t oloc(p); | |
452 | dout(10) << " remove backtrace object " << oid | |
453 | << " old pool " << p << " snapc " << item.snapc << dendl; | |
454 | objecter->remove(oid, oloc, item.snapc, | |
455 | ceph::real_clock::now(), 0, | |
456 | gather.new_sub()); | |
457 | } | |
458 | } else if (item.action == PurgeItem::PURGE_DIR) { | |
459 | object_locator_t oloc(metadata_pool); | |
460 | std::list<frag_t> frags; | |
461 | if (!item.fragtree.is_leaf(frag_t())) | |
462 | item.fragtree.get_leaves(frags); | |
463 | frags.push_back(frag_t()); | |
464 | for (const auto &frag : frags) { | |
465 | object_t oid = CInode::get_object_name(item.ino, frag, ""); | |
466 | dout(10) << " remove dirfrag " << oid << dendl; | |
467 | objecter->remove(oid, oloc, nullsnapc, | |
468 | ceph::real_clock::now(), | |
469 | 0, gather.new_sub()); | |
470 | } | |
471 | } else if (item.action == PurgeItem::TRUNCATE_FILE) { | |
472 | const uint64_t num = Striper::get_num_objects(item.layout, item.size); | |
473 | dout(10) << " 0~" << item.size << " objects 0~" << num | |
474 | << " snapc " << item.snapc << " on " << item.ino << dendl; | |
475 | ||
476 | // keep backtrace object | |
477 | if (num > 1) { | |
478 | filer.purge_range(item.ino, &item.layout, item.snapc, | |
479 | 1, num - 1, ceph::real_clock::now(), | |
480 | 0, gather.new_sub()); | |
481 | } | |
482 | filer.zero(item.ino, &item.layout, item.snapc, | |
483 | 0, item.layout.object_size, | |
484 | ceph::real_clock::now(), | |
485 | 0, true, gather.new_sub()); | |
486 | } else { | |
487 | derr << "Invalid item (action=" << item.action << ") in purge queue, " | |
488 | "dropping it" << dendl; | |
489 | in_flight.erase(expire_to); | |
490 | logger->set(l_pq_executing, in_flight.size()); | |
491 | return; | |
492 | } | |
493 | assert(gather.has_subs()); | |
494 | ||
31f18b77 FG |
495 | gather.set_finisher(new C_OnFinisher( |
496 | new FunctionContext([this, expire_to](int r){ | |
497 | Mutex::Locker l(lock); | |
498 | _execute_item_complete(expire_to); | |
7c673cae | 499 | |
31f18b77 | 500 | _consume(); |
7c673cae FG |
501 | |
502 | // Have we gone idle? If so, do an extra write_head now instead of | |
503 | // waiting for next flush after journaler_write_head_interval. | |
504 | // Also do this periodically even if not idle, so that the persisted | |
505 | // expire_pos doesn't fall too far behind our progress when consuming | |
506 | // a very long queue. | |
507 | if (in_flight.empty() || journaler.write_head_needed()) { | |
508 | journaler.write_head(new FunctionContext([this](int r){ | |
509 | journaler.trim(); | |
510 | })); | |
511 | } | |
31f18b77 FG |
512 | }), &finisher)); |
513 | ||
7c673cae FG |
514 | gather.activate(); |
515 | } | |
516 | ||
// Called (under `lock`, via the gather finisher) when all RADOS ops for
// the item keyed by `expire_to` have completed.  Releases the item's
// throttle budget and, if it was the oldest in-flight item, advances
// the journal expire position.
void PurgeQueue::_execute_item_complete(
    uint64_t expire_to)
{
  assert(lock.is_locked_by_me());
  dout(10) << "complete at 0x" << std::hex << expire_to << std::dec << dendl;
  assert(in_flight.count(expire_to) == 1);

  auto iter = in_flight.find(expire_to);
  assert(iter != in_flight.end());
  if (iter == in_flight.begin()) {
    // This was the lowest journal position in flight, so we can now
    // safely expire the journal up to here.
    dout(10) << "expiring to 0x" << std::hex << expire_to << std::dec << dendl;
    journaler.set_expire_pos(expire_to);
  } else {
    // This is completely fine, we're not supposed to purge files in
    // order when doing them in parallel.
    dout(10) << "non-sequential completion, not expiring anything" << dendl;
  }

  // Release the op budget charged in _execute_item().
  ops_in_flight -= _calculate_ops(iter->second);
  logger->set(l_pq_executing_ops, ops_in_flight);

  dout(10) << "completed item for ino 0x" << std::hex << iter->second.ino
           << std::dec << dendl;

  in_flight.erase(iter);
  logger->set(l_pq_executing, in_flight.size());
  dout(10) << "in_flight.size() now " << in_flight.size() << dendl;

  logger->inc(l_pq_executed);
}
549 | ||
// Recompute max_purge_ops from the cluster shape: ops-per-PG preference
// times PGs-per-MDS across all data pools, optionally capped by the
// mds_max_purge_ops hard limit.
void PurgeQueue::update_op_limit(const MDSMap &mds_map)
{
  Mutex::Locker l(lock);

  uint64_t pg_count = 0;
  objecter->with_osdmap([&](const OSDMap& o) {
    // Number of PGs across all data pools
    const std::vector<int64_t> &data_pools = mds_map.get_data_pools();
    for (const auto dp : data_pools) {
      if (o.get_pg_pool(dp) == NULL) {
        // It is possible that we have an older OSDMap than MDSMap,
        // because we don't start watching every OSDMap until after
        // MDSRank is initialized
        dout(4) << " data pool " << dp << " not found in OSDMap" << dendl;
        continue;
      }
      pg_count += o.get_pg_num(dp);
    }
  });

  // Work out a limit based on n_pgs / n_mdss, multiplied by the user's
  // preference for how many ops per PG
  max_purge_ops = uint64_t(((double)pg_count / (double)mds_map.get_max_mds()) *
                           cct->_conf->mds_max_purge_ops_per_pg);

  // User may also specify a hard limit, apply this if so.
  if (cct->_conf->mds_max_purge_ops) {
    max_purge_ops = MIN(max_purge_ops, cct->_conf->mds_max_purge_ops);
  }
}
580 | ||
581 | void PurgeQueue::handle_conf_change(const struct md_config_t *conf, | |
582 | const std::set <std::string> &changed, | |
583 | const MDSMap &mds_map) | |
584 | { | |
585 | if (changed.count("mds_max_purge_ops") | |
586 | || changed.count("mds_max_purge_ops_per_pg")) { | |
587 | update_op_limit(mds_map); | |
588 | } else if (changed.count("mds_max_purge_files")) { | |
589 | Mutex::Locker l(lock); | |
590 | ||
591 | if (in_flight.empty()) { | |
592 | // We might have gone from zero to a finite limit, so | |
593 | // might need to kick off consume. | |
594 | dout(4) << "maybe start work again (max_purge_files=" | |
595 | << conf->mds_max_purge_files << dendl; | |
596 | finisher.queue(new FunctionContext([this](int r){ | |
597 | Mutex::Locker l(lock); | |
598 | _consume(); | |
599 | })); | |
600 | } | |
601 | } | |
602 | } | |
603 | ||
// Poll-style drain for MDS shutdown: returns true when the queue is
// fully drained, otherwise fills in progress counters (bytes consumed
// vs. the largest backlog seen since draining began) and the number of
// items still in flight.
bool PurgeQueue::drain(
    uint64_t *progress,
    uint64_t *progress_total,
    size_t *in_flight_count
    )
{
  assert(progress != nullptr);
  assert(progress_total != nullptr);
  assert(in_flight_count != nullptr);

  const bool done = in_flight.empty() && (
      journaler.get_read_pos() == journaler.get_write_pos());
  if (done) {
    return true;
  }

  const uint64_t bytes_remaining = journaler.get_write_pos()
                                   - journaler.get_read_pos();

  if (!draining) {
    // Start of draining: remember how much there was outstanding at
    // this point so that we can give a progress percentage later
    draining = true;

    // Lift the op throttle as this daemon now has nothing to do but
    // drain the purge queue, so do it as fast as we can.
    max_purge_ops = 0xffff;
  }

  drain_initial = max(bytes_remaining, drain_initial);

  *progress = drain_initial - bytes_remaining;
  *progress_total = drain_initial;
  *in_flight_count = in_flight.size();

  return false;
}
641 |