]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#ifndef CEPH_OSD_INTERNAL_TYPES_H
#define CEPH_OSD_INTERNAL_TYPES_H

#include "osd_types.h"
#include "OpRequest.h"
/*
 * keep tabs on object modifications that are in flight.
 * we need to know the projected existence, size, snapset,
 * etc., because we don't send writes down to disk until after
 * replicas ack.
 */
17 | struct SnapSetContext { | |
18 | hobject_t oid; | |
19 | SnapSet snapset; | |
20 | int ref; | |
21 | bool registered : 1; | |
22 | bool exists : 1; | |
23 | ||
24 | explicit SnapSetContext(const hobject_t& o) : | |
25 | oid(o), ref(0), registered(false), exists(true) { } | |
26 | }; | |
27 | ||
28 | struct ObjectContext; | |
29 | ||
30 | struct ObjectState { | |
31 | object_info_t oi; | |
32 | bool exists; ///< the stored object exists (i.e., we will remember the object_info_t) | |
33 | ||
34 | ObjectState() : exists(false) {} | |
35 | ||
36 | ObjectState(const object_info_t &oi_, bool exists_) | |
37 | : oi(oi_), exists(exists_) {} | |
38 | }; | |
39 | ||
40 | typedef ceph::shared_ptr<ObjectContext> ObjectContextRef; | |
41 | ||
/**
 * In-memory context for one object: the projected object state (obs),
 * its snapset context, in-flight read/write lock state (rwstate),
 * watcher bookkeeping, an attr cache, and a simple ondisk
 * reader/writer exclusion used for unstable (not-yet-acked) writes.
 *
 * Lifetime is managed via ObjectContextRef; if destructor_callback is
 * set it is completed with 0 when this context is destroyed.
 */
struct ObjectContext {
  ObjectState obs;              ///< projected object state (object_info + existence)

  SnapSetContext *ssc;          // may be null

  Context *destructor_callback; ///< completed with 0 in ~ObjectContext, if set

private:
  Mutex lock;                   ///< protects the ondisk_* counters below
public:
  Cond cond;                    ///< signalled as the ondisk_* counters change
  /// counters for the ondisk read/write exclusion implemented below
  int unstable_writes, readers, writers_waiting, readers_waiting;


  // any entity in obs.oi.watchers MUST be in either watchers or unconnected_watchers.
  map<pair<uint64_t, entity_name_t>, WatchRef> watchers;

  // attr cache
  map<string, bufferlist> attr_cache;

  /**
   * In-flight read/write lock state for this object.
   *
   * Multiple readers may hold the lock concurrently (RWREAD) and
   * multiple writers may hold it concurrently (RWWRITE); `count`
   * tracks the number of current holders.  RWEXCL admits exactly one
   * holder.  Ops that cannot take the lock are queued on `waiters`
   * and spliced out for requeueing when the last holder drops the
   * lock (see dec()).
   */
  struct RWState {
    enum State {
      RWNONE,
      RWREAD,
      RWWRITE,
      RWEXCL,
    };
    /// printable name for a lock state
    static const char *get_state_name(State s) {
      switch (s) {
      case RWNONE: return "none";
      case RWREAD: return "read";
      case RWWRITE: return "write";
      case RWEXCL: return "excl";
      default: return "???";
      }
    }
    /// printable name for the current state
    const char *get_state_name() const {
      return get_state_name(state);
    }

    list<OpRequestRef> waiters;  ///< ops waiting on state change
    int count;                   ///< number of readers or writers

    State state:4;               ///< rw state
    /// if set, restart backfill when we can get a read lock
    bool recovery_read_marker:1;
    /// if set, requeue snaptrim on lock release
    bool snaptrimmer_write_marker:1;

    RWState()
      : count(0),
        state(RWNONE),
        recovery_read_marker(false),
        snaptrimmer_write_marker(false)
    {}
    /// take a read lock, or queue op on waiters; true on success
    bool get_read(OpRequestRef op) {
      if (get_read_lock()) {
        return true;
      } // else
      waiters.push_back(op);
      return false;
    }
    /// this function adjusts the counts if necessary
    bool get_read_lock() {
      // don't starve anybody!
      if (!waiters.empty()) {
        return false;
      }
      switch (state) {
      case RWNONE:
        assert(count == 0);
        state = RWREAD;
        // fall through
      case RWREAD:
        count++;
        return true;
      case RWWRITE:
        return false;
      case RWEXCL:
        return false;
      default:
        assert(0 == "unhandled case");
        return false;
      }
    }

    /// take a write lock, or (if op is set) queue it; true on success
    bool get_write(OpRequestRef op, bool greedy=false) {
      if (get_write_lock(greedy)) {
        return true;
      } // else
      if (op)
        waiters.push_back(op);
      return false;
    }
    /// take a write lock; greedy mode skips the anti-starvation checks
    bool get_write_lock(bool greedy=false) {
      if (!greedy) {
        // don't starve anybody!
        if (!waiters.empty() ||
            recovery_read_marker) {
          return false;
        }
      }
      switch (state) {
      case RWNONE:
        assert(count == 0);
        state = RWWRITE;
        // fall through
      case RWWRITE:
        count++;
        return true;
      case RWREAD:
        return false;
      case RWEXCL:
        return false;
      default:
        assert(0 == "unhandled case");
        return false;
      }
    }
    /// take the exclusive lock; only succeeds from RWNONE
    bool get_excl_lock() {
      switch (state) {
      case RWNONE:
        assert(count == 0);
        state = RWEXCL;
        count = 1;
        return true;
      case RWWRITE:
        return false;
      case RWREAD:
        return false;
      case RWEXCL:
        return false;
      default:
        assert(0 == "unhandled case");
        return false;
      }
    }
    /// take the exclusive lock, or (if op is set) queue it
    bool get_excl(OpRequestRef op) {
      if (get_excl_lock()) {
        return true;
      } // else
      if (op)
        waiters.push_back(op);
      return false;
    }
    /// same as get_write_lock, but ignore starvation
    bool take_write_lock() {
      if (state == RWWRITE) {
        count++;
        return true;
      }
      return get_write_lock();
    }
    /// drop one holder; on the last one, reset state and hand the
    /// queued waiters back to the caller for requeueing
    void dec(list<OpRequestRef> *requeue) {
      assert(count > 0);
      assert(requeue);
      count--;
      if (count == 0) {
        state = RWNONE;
        requeue->splice(requeue->end(), waiters);
      }
    }
    void put_read(list<OpRequestRef> *requeue) {
      assert(state == RWREAD);
      dec(requeue);
    }
    void put_write(list<OpRequestRef> *requeue) {
      assert(state == RWWRITE);
      dec(requeue);
    }
    void put_excl(list<OpRequestRef> *requeue) {
      assert(state == RWEXCL);
      dec(requeue);
    }
    /// true if nobody holds the lock
    bool empty() const { return state == RWNONE; }
  } rwstate;

  bool get_read(OpRequestRef op) {
    return rwstate.get_read(op);
  }
  bool get_write(OpRequestRef op) {
    return rwstate.get_write(op, false);
  }
  bool get_excl(OpRequestRef op) {
    return rwstate.get_excl(op);
  }
  /// dispatch to get_read/get_write/get_excl by requested lock type
  bool get_lock_type(OpRequestRef op, RWState::State type) {
    switch (type) {
    case RWState::RWWRITE:
      return get_write(op);
    case RWState::RWREAD:
      return get_read(op);
    case RWState::RWEXCL:
      return get_excl(op);
    default:
      assert(0 == "invalid lock type");
      return true;
    }
  }
  bool get_write_greedy(OpRequestRef op) {
    return rwstate.get_write(op, true);
  }
  /// try the write lock for snaptrim; optionally mark for requeue on release
  bool get_snaptrimmer_write(bool mark_if_unsuccessful) {
    if (rwstate.get_write_lock()) {
      return true;
    } else {
      if (mark_if_unsuccessful)
        rwstate.snaptrimmer_write_marker = true;
      return false;
    }
  }
  /// try the read lock for recovery; always sets the recovery marker
  bool get_recovery_read() {
    rwstate.recovery_read_marker = true;
    if (rwstate.get_read_lock()) {
      return true;
    }
    return false;
  }
  /// non-queueing read-lock attempt
  bool try_get_read_lock() {
    return rwstate.get_read_lock();
  }
  /// release a recovery read lock and clear the marker
  void drop_recovery_read(list<OpRequestRef> *ls) {
    assert(rwstate.recovery_read_marker);
    rwstate.put_read(ls);
    rwstate.recovery_read_marker = false;
  }
  /// release a lock of the given type; outputs ops to wake, and flags
  /// telling the caller whether to requeue recovery and/or snaptrim
  void put_lock_type(
    ObjectContext::RWState::State type,
    list<OpRequestRef> *to_wake,
    bool *requeue_recovery,
    bool *requeue_snaptrimmer) {
    switch (type) {
    case ObjectContext::RWState::RWWRITE:
      rwstate.put_write(to_wake);
      break;
    case ObjectContext::RWState::RWREAD:
      rwstate.put_read(to_wake);
      break;
    case ObjectContext::RWState::RWEXCL:
      rwstate.put_excl(to_wake);
      break;
    default:
      assert(0 == "invalid lock type");
    }
    if (rwstate.empty() && rwstate.recovery_read_marker) {
      rwstate.recovery_read_marker = false;
      *requeue_recovery = true;
    }
    if (rwstate.empty() && rwstate.snaptrimmer_write_marker) {
      rwstate.snaptrimmer_write_marker = false;
      *requeue_snaptrimmer = true;
    }
  }
  /// true if the rw lock currently has any holder
  bool is_request_pending() {
    return (rwstate.count > 0);
  }

  ObjectContext()
    : ssc(NULL),
      destructor_callback(0),
      lock("PrimaryLogPG::ObjectContext::lock"),
      unstable_writes(0), readers(0), writers_waiting(0), readers_waiting(0),
      blocked(false), requeue_scrub_on_unblock(false) {}

  ~ObjectContext() {
    assert(rwstate.empty());
    if (destructor_callback)
      destructor_callback->complete(0);
  }

  void start_block() {
    assert(!blocked);
    blocked = true;
  }
  void stop_block() {
    assert(blocked);
    blocked = false;
  }
  bool is_blocked() const {
    return blocked;
  }

  // do simple synchronous mutual exclusion, for now. no waitqueues or anything fancy.
  // Writers wait until there are no (pending or active) readers, then
  // bump unstable_writes; readers wait until all unstable writes drain.
  void ondisk_write_lock() {
    lock.Lock();
    writers_waiting++;
    while (readers_waiting || readers)
      cond.Wait(lock);
    writers_waiting--;
    unstable_writes++;
    lock.Unlock();
  }
  void ondisk_write_unlock() {
    lock.Lock();
    assert(unstable_writes > 0);
    unstable_writes--;
    if (!unstable_writes && readers_waiting)
      cond.Signal();
    lock.Unlock();
  }
  void ondisk_read_lock() {
    lock.Lock();
    readers_waiting++;
    while (unstable_writes)
      cond.Wait(lock);
    readers_waiting--;
    readers++;
    lock.Unlock();
  }
  void ondisk_read_unlock() {
    lock.Lock();
    assert(readers > 0);
    readers--;
    if (!readers && writers_waiting)
      cond.Signal();
    lock.Unlock();
  }

  /// true while an operation has this object blocked (see start_block/stop_block)
  // NOTE(review): the old comment here ("in-progress copyfrom ops for this
  // object") appears to describe a removed field, not `blocked` — confirm.
  bool blocked:1;
  bool requeue_scrub_on_unblock:1;    // true if we need to requeue scrub on unblock

};
365 | ||
366 | inline ostream& operator<<(ostream& out, const ObjectState& obs) | |
367 | { | |
368 | out << obs.oi.soid; | |
369 | if (!obs.exists) | |
370 | out << "(dne)"; | |
371 | return out; | |
372 | } | |
373 | ||
374 | inline ostream& operator<<(ostream& out, const ObjectContext::RWState& rw) | |
375 | { | |
376 | return out << "rwstate(" << rw.get_state_name() | |
377 | << " n=" << rw.count | |
378 | << " w=" << rw.waiters.size() | |
379 | << ")"; | |
380 | } | |
381 | ||
382 | inline ostream& operator<<(ostream& out, const ObjectContext& obc) | |
383 | { | |
384 | return out << "obc(" << obc.obs << " " << obc.rwstate << ")"; | |
385 | } | |
386 | ||
387 | class ObcLockManager { | |
388 | struct ObjectLockState { | |
389 | ObjectContextRef obc; | |
390 | ObjectContext::RWState::State type; | |
391 | ObjectLockState( | |
392 | ObjectContextRef obc, | |
393 | ObjectContext::RWState::State type) | |
394 | : obc(obc), type(type) {} | |
395 | }; | |
396 | map<hobject_t, ObjectLockState> locks; | |
397 | public: | |
398 | ObcLockManager() = default; | |
399 | ObcLockManager(ObcLockManager &&) = default; | |
400 | ObcLockManager(const ObcLockManager &) = delete; | |
401 | ObcLockManager &operator=(ObcLockManager &&) = default; | |
402 | bool empty() const { | |
403 | return locks.empty(); | |
404 | } | |
405 | bool get_lock_type( | |
406 | ObjectContext::RWState::State type, | |
407 | const hobject_t &hoid, | |
408 | ObjectContextRef obc, | |
409 | OpRequestRef op) { | |
410 | assert(locks.find(hoid) == locks.end()); | |
411 | if (obc->get_lock_type(op, type)) { | |
412 | locks.insert(make_pair(hoid, ObjectLockState(obc, type))); | |
413 | return true; | |
414 | } else { | |
415 | return false; | |
416 | } | |
417 | } | |
418 | /// Get write lock, ignore starvation | |
419 | bool take_write_lock( | |
420 | const hobject_t &hoid, | |
421 | ObjectContextRef obc) { | |
422 | assert(locks.find(hoid) == locks.end()); | |
423 | if (obc->rwstate.take_write_lock()) { | |
424 | locks.insert( | |
425 | make_pair( | |
426 | hoid, ObjectLockState(obc, ObjectContext::RWState::RWWRITE))); | |
427 | return true; | |
428 | } else { | |
429 | return false; | |
430 | } | |
431 | } | |
432 | /// Get write lock for snap trim | |
433 | bool get_snaptrimmer_write( | |
434 | const hobject_t &hoid, | |
435 | ObjectContextRef obc, | |
436 | bool mark_if_unsuccessful) { | |
437 | assert(locks.find(hoid) == locks.end()); | |
438 | if (obc->get_snaptrimmer_write(mark_if_unsuccessful)) { | |
439 | locks.insert( | |
440 | make_pair( | |
441 | hoid, ObjectLockState(obc, ObjectContext::RWState::RWWRITE))); | |
442 | return true; | |
443 | } else { | |
444 | return false; | |
445 | } | |
446 | } | |
447 | /// Get write lock greedy | |
448 | bool get_write_greedy( | |
449 | const hobject_t &hoid, | |
450 | ObjectContextRef obc, | |
451 | OpRequestRef op) { | |
452 | assert(locks.find(hoid) == locks.end()); | |
453 | if (obc->get_write_greedy(op)) { | |
454 | locks.insert( | |
455 | make_pair( | |
456 | hoid, ObjectLockState(obc, ObjectContext::RWState::RWWRITE))); | |
457 | return true; | |
458 | } else { | |
459 | return false; | |
460 | } | |
461 | } | |
462 | ||
463 | /// try get read lock | |
464 | bool try_get_read_lock( | |
465 | const hobject_t &hoid, | |
466 | ObjectContextRef obc) { | |
467 | assert(locks.find(hoid) == locks.end()); | |
468 | if (obc->try_get_read_lock()) { | |
469 | locks.insert( | |
470 | make_pair( | |
471 | hoid, | |
472 | ObjectLockState(obc, ObjectContext::RWState::RWREAD))); | |
473 | return true; | |
474 | } else { | |
475 | return false; | |
476 | } | |
477 | } | |
478 | ||
479 | void put_locks( | |
480 | list<pair<hobject_t, list<OpRequestRef> > > *to_requeue, | |
481 | bool *requeue_recovery, | |
482 | bool *requeue_snaptrimmer) { | |
483 | for (auto p: locks) { | |
484 | list<OpRequestRef> _to_requeue; | |
485 | p.second.obc->put_lock_type( | |
486 | p.second.type, | |
487 | &_to_requeue, | |
488 | requeue_recovery, | |
489 | requeue_snaptrimmer); | |
490 | if (to_requeue) { | |
491 | to_requeue->push_back( | |
492 | make_pair( | |
493 | p.second.obc->obs.oi.soid, | |
494 | std::move(_to_requeue))); | |
495 | } | |
496 | } | |
497 | locks.clear(); | |
498 | } | |
499 | ~ObcLockManager() { | |
500 | assert(locks.empty()); | |
501 | } | |
502 | }; | |
503 | ||


#endif