]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | #ifndef CEPH_MDSCACHEOBJECT_H |
2 | #define CEPH_MDSCACHEOBJECT_H | |
3 | ||
7c673cae | 4 | #include <ostream> |
7c673cae FG |
5 | |
6 | #include "common/config.h" | |
181888fb FG |
7 | |
8 | #include "include/Context.h" | |
9 | #include "include/alloc_ptr.h" | |
7c673cae | 10 | #include "include/assert.h" |
181888fb | 11 | #include "include/mempool.h" |
7c673cae FG |
12 | #include "include/types.h" |
13 | #include "include/xlist.h" | |
181888fb | 14 | |
7c673cae FG |
15 | #include "mdstypes.h" |
16 | ||
17 | #define MDS_REF_SET // define me for improved debug output, sanity checking | |
18 | //#define MDS_AUTHPIN_SET // define me for debugging auth pin leaks | |
19 | //#define MDS_VERIFY_FRAGSTAT // do (slow) sanity checking on frags | |
20 | ||
21 | ||
22 | class MLock; | |
23 | class SimpleLock; | |
24 | class MDSCacheObject; | |
25 | class MDSInternalContextBase; | |
26 | ||
27 | /* | |
28 | * for metadata leases to clients | |
29 | */ | |
30 | struct ClientLease { | |
31 | client_t client; | |
32 | MDSCacheObject *parent; | |
33 | ||
34 | ceph_seq_t seq; | |
35 | utime_t ttl; | |
36 | xlist<ClientLease*>::item item_session_lease; // per-session list | |
37 | xlist<ClientLease*>::item item_lease; // global list | |
38 | ||
39 | ClientLease(client_t c, MDSCacheObject *p) : | |
40 | client(c), parent(p), seq(0), | |
41 | item_session_lease(this), | |
42 | item_lease(this) { } | |
43 | }; | |
44 | ||
45 | ||
46 | // print hack | |
47 | struct mdsco_db_line_prefix { | |
48 | MDSCacheObject *object; | |
49 | explicit mdsco_db_line_prefix(MDSCacheObject *o) : object(o) {} | |
50 | }; | |
31f18b77 | 51 | std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o); |
7c673cae FG |
52 | |
53 | // printer | |
31f18b77 | 54 | std::ostream& operator<<(std::ostream& out, const MDSCacheObject &o); |
7c673cae FG |
55 | |
56 | class MDSCacheObject { | |
57 | public: | |
58 | // -- pins -- | |
59 | const static int PIN_REPLICATED = 1000; | |
60 | const static int PIN_DIRTY = 1001; | |
61 | const static int PIN_LOCK = -1002; | |
62 | const static int PIN_REQUEST = -1003; | |
63 | const static int PIN_WAITER = 1004; | |
64 | const static int PIN_DIRTYSCATTERED = -1005; | |
65 | static const int PIN_AUTHPIN = 1006; | |
66 | static const int PIN_PTRWAITER = -1007; | |
67 | const static int PIN_TEMPEXPORTING = 1008; // temp pin between encode_ and finish_export | |
68 | static const int PIN_CLIENTLEASE = 1009; | |
69 | static const int PIN_DISCOVERBASE = 1010; | |
70 | ||
71 | const char *generic_pin_name(int p) const { | |
72 | switch (p) { | |
73 | case PIN_REPLICATED: return "replicated"; | |
74 | case PIN_DIRTY: return "dirty"; | |
75 | case PIN_LOCK: return "lock"; | |
76 | case PIN_REQUEST: return "request"; | |
77 | case PIN_WAITER: return "waiter"; | |
78 | case PIN_DIRTYSCATTERED: return "dirtyscattered"; | |
79 | case PIN_AUTHPIN: return "authpin"; | |
80 | case PIN_PTRWAITER: return "ptrwaiter"; | |
81 | case PIN_TEMPEXPORTING: return "tempexporting"; | |
82 | case PIN_CLIENTLEASE: return "clientlease"; | |
83 | case PIN_DISCOVERBASE: return "discoverbase"; | |
84 | default: ceph_abort(); return 0; | |
85 | } | |
86 | } | |
87 | ||
88 | // -- state -- | |
89 | const static int STATE_AUTH = (1<<30); | |
90 | const static int STATE_DIRTY = (1<<29); | |
91 | const static int STATE_NOTIFYREF = (1<<28); // notify dropping ref drop through _put() | |
92 | const static int STATE_REJOINING = (1<<27); // replica has not joined w/ primary copy | |
93 | const static int STATE_REJOINUNDEF = (1<<26); // contents undefined. | |
94 | ||
95 | ||
96 | // -- wait -- | |
97 | const static uint64_t WAIT_ORDERED = (1ull<<61); | |
98 | const static uint64_t WAIT_SINGLEAUTH = (1ull<<60); | |
99 | const static uint64_t WAIT_UNFREEZE = (1ull<<59); // pka AUTHPINNABLE | |
100 | ||
101 | ||
102 | // ============================================ | |
103 | // cons | |
104 | public: | |
105 | MDSCacheObject() : | |
106 | state(0), | |
107 | ref(0), | |
108 | auth_pins(0), nested_auth_pins(0), | |
109 | replica_nonce(0) | |
110 | {} | |
111 | virtual ~MDSCacheObject() {} | |
112 | ||
113 | // printing | |
114 | virtual void print(std::ostream& out) = 0; | |
115 | virtual std::ostream& print_db_line_prefix(std::ostream& out) { | |
116 | return out << "mdscacheobject(" << this << ") "; | |
117 | } | |
118 | ||
119 | // -------------------------------------------- | |
120 | // state | |
121 | protected: | |
122 | __u32 state; // state bits | |
123 | ||
124 | public: | |
125 | unsigned get_state() const { return state; } | |
126 | unsigned state_test(unsigned mask) const { return (state & mask); } | |
127 | void state_clear(unsigned mask) { state &= ~mask; } | |
128 | void state_set(unsigned mask) { state |= mask; } | |
129 | void state_reset(unsigned s) { state = s; } | |
130 | ||
131 | bool is_auth() const { return state_test(STATE_AUTH); } | |
132 | bool is_dirty() const { return state_test(STATE_DIRTY); } | |
133 | bool is_clean() const { return !is_dirty(); } | |
134 | bool is_rejoining() const { return state_test(STATE_REJOINING); } | |
135 | ||
136 | // -------------------------------------------- | |
137 | // authority | |
138 | virtual mds_authority_t authority() const = 0; | |
139 | bool is_ambiguous_auth() const { | |
140 | return authority().second != CDIR_AUTH_UNKNOWN; | |
141 | } | |
142 | ||
143 | // -------------------------------------------- | |
144 | // pins | |
145 | protected: | |
146 | __s32 ref; // reference count | |
147 | #ifdef MDS_REF_SET | |
181888fb | 148 | mempool::mds_co::map<int,int> ref_map; |
7c673cae FG |
149 | #endif |
150 | ||
151 | public: | |
152 | int get_num_ref(int by = -1) const { | |
153 | #ifdef MDS_REF_SET | |
154 | if (by >= 0) { | |
155 | if (ref_map.find(by) == ref_map.end()) { | |
156 | return 0; | |
157 | } else { | |
158 | return ref_map.find(by)->second; | |
159 | } | |
160 | } | |
161 | #endif | |
162 | return ref; | |
163 | } | |
164 | virtual const char *pin_name(int by) const = 0; | |
165 | //bool is_pinned_by(int by) { return ref_set.count(by); } | |
166 | //multiset<int>& get_ref_set() { return ref_set; } | |
167 | ||
168 | virtual void last_put() {} | |
169 | virtual void bad_put(int by) { | |
170 | #ifdef MDS_REF_SET | |
171 | assert(ref_map[by] > 0); | |
172 | #endif | |
173 | assert(ref > 0); | |
174 | } | |
175 | virtual void _put() {} | |
176 | void put(int by) { | |
177 | #ifdef MDS_REF_SET | |
178 | if (ref == 0 || ref_map[by] == 0) { | |
179 | #else | |
180 | if (ref == 0) { | |
181 | #endif | |
182 | bad_put(by); | |
183 | } else { | |
184 | ref--; | |
185 | #ifdef MDS_REF_SET | |
186 | ref_map[by]--; | |
187 | #endif | |
188 | if (ref == 0) | |
189 | last_put(); | |
190 | if (state_test(STATE_NOTIFYREF)) | |
191 | _put(); | |
192 | } | |
193 | } | |
194 | ||
195 | virtual void first_get() {} | |
196 | virtual void bad_get(int by) { | |
197 | #ifdef MDS_REF_SET | |
198 | assert(by < 0 || ref_map[by] == 0); | |
199 | #endif | |
200 | ceph_abort(); | |
201 | } | |
202 | void get(int by) { | |
203 | if (ref == 0) | |
204 | first_get(); | |
205 | ref++; | |
206 | #ifdef MDS_REF_SET | |
207 | if (ref_map.find(by) == ref_map.end()) | |
208 | ref_map[by] = 0; | |
209 | ref_map[by]++; | |
210 | #endif | |
211 | } | |
212 | ||
213 | void print_pin_set(std::ostream& out) const { | |
214 | #ifdef MDS_REF_SET | |
215 | std::map<int, int>::const_iterator it = ref_map.begin(); | |
216 | while (it != ref_map.end()) { | |
217 | out << " " << pin_name(it->first) << "=" << it->second; | |
218 | ++it; | |
219 | } | |
220 | #else | |
221 | out << " nref=" << ref; | |
222 | #endif | |
223 | } | |
224 | ||
225 | protected: | |
226 | int auth_pins; | |
227 | int nested_auth_pins; | |
228 | #ifdef MDS_AUTHPIN_SET | |
181888fb | 229 | mempool::mds_co::multiset<void*> auth_pin_set; |
7c673cae FG |
230 | #endif |
231 | ||
232 | public: | |
233 | bool is_auth_pinned() const { return auth_pins || nested_auth_pins; } | |
234 | int get_num_auth_pins() const { return auth_pins; } | |
235 | int get_num_nested_auth_pins() const { return nested_auth_pins; } | |
236 | ||
237 | void dump_states(Formatter *f) const; | |
238 | void dump(Formatter *f) const; | |
239 | ||
240 | // -------------------------------------------- | |
241 | // auth pins | |
242 | virtual bool can_auth_pin() const = 0; | |
243 | virtual void auth_pin(void *who) = 0; | |
244 | virtual void auth_unpin(void *who) = 0; | |
245 | virtual bool is_frozen() const = 0; | |
246 | virtual bool is_freezing() const = 0; | |
247 | virtual bool is_freezing_or_frozen() const { | |
248 | return is_frozen() || is_freezing(); | |
249 | } | |
250 | ||
251 | ||
252 | // -------------------------------------------- | |
253 | // replication (across mds cluster) | |
254 | protected: | |
255 | unsigned replica_nonce; // [replica] defined on replica | |
181888fb FG |
256 | typedef compact_map<mds_rank_t,unsigned> replica_map_type; |
257 | replica_map_type replica_map; // [auth] mds -> nonce | |
7c673cae FG |
258 | |
259 | public: | |
181888fb FG |
260 | bool is_replicated() const { return !get_replicas().empty(); } |
261 | bool is_replica(mds_rank_t mds) const { return get_replicas().count(mds); } | |
262 | int num_replicas() const { return get_replicas().size(); } | |
7c673cae | 263 | unsigned add_replica(mds_rank_t mds) { |
181888fb FG |
264 | if (get_replicas().count(mds)) |
265 | return ++get_replicas()[mds]; // inc nonce | |
266 | if (get_replicas().empty()) | |
7c673cae | 267 | get(PIN_REPLICATED); |
181888fb | 268 | return get_replicas()[mds] = 1; |
7c673cae FG |
269 | } |
270 | void add_replica(mds_rank_t mds, unsigned nonce) { | |
181888fb | 271 | if (get_replicas().empty()) |
7c673cae | 272 | get(PIN_REPLICATED); |
181888fb | 273 | get_replicas()[mds] = nonce; |
7c673cae FG |
274 | } |
275 | unsigned get_replica_nonce(mds_rank_t mds) { | |
181888fb FG |
276 | assert(get_replicas().count(mds)); |
277 | return get_replicas()[mds]; | |
7c673cae FG |
278 | } |
279 | void remove_replica(mds_rank_t mds) { | |
181888fb FG |
280 | assert(get_replicas().count(mds)); |
281 | get_replicas().erase(mds); | |
282 | if (get_replicas().empty()) { | |
7c673cae | 283 | put(PIN_REPLICATED); |
181888fb | 284 | } |
7c673cae FG |
285 | } |
286 | void clear_replica_map() { | |
181888fb | 287 | if (!get_replicas().empty()) |
7c673cae FG |
288 | put(PIN_REPLICATED); |
289 | replica_map.clear(); | |
290 | } | |
181888fb FG |
291 | replica_map_type& get_replicas() { return replica_map; } |
292 | const replica_map_type& get_replicas() const { return replica_map; } | |
7c673cae | 293 | void list_replicas(std::set<mds_rank_t>& ls) const { |
181888fb FG |
294 | for (const auto &p : get_replicas()) { |
295 | ls.insert(p.first); | |
296 | } | |
7c673cae FG |
297 | } |
298 | ||
299 | unsigned get_replica_nonce() const { return replica_nonce; } | |
300 | void set_replica_nonce(unsigned n) { replica_nonce = n; } | |
301 | ||
302 | ||
303 | // --------------------------------------------- | |
304 | // waiting | |
181888fb FG |
305 | private: |
306 | alloc_ptr<mempool::mds_co::multimap<uint64_t, std::pair<uint64_t, MDSInternalContextBase*>>> waiting; | |
7c673cae FG |
307 | static uint64_t last_wait_seq; |
308 | ||
309 | public: | |
310 | bool is_waiter_for(uint64_t mask, uint64_t min=0) { | |
311 | if (!min) { | |
312 | min = mask; | |
313 | while (min & (min-1)) // if more than one bit is set | |
181888fb | 314 | min &= min-1; // clear LSB |
7c673cae | 315 | } |
181888fb FG |
316 | if (waiting) { |
317 | for (auto p = waiting->lower_bound(min); p != waiting->end(); ++p) { | |
318 | if (p->first & mask) return true; | |
319 | if (p->first > mask) return false; | |
320 | } | |
7c673cae FG |
321 | } |
322 | return false; | |
323 | } | |
324 | virtual void add_waiter(uint64_t mask, MDSInternalContextBase *c) { | |
181888fb | 325 | if (waiting->empty()) |
7c673cae FG |
326 | get(PIN_WAITER); |
327 | ||
328 | uint64_t seq = 0; | |
329 | if (mask & WAIT_ORDERED) { | |
330 | seq = ++last_wait_seq; | |
331 | mask &= ~WAIT_ORDERED; | |
332 | } | |
181888fb | 333 | waiting->insert(pair<uint64_t, pair<uint64_t, MDSInternalContextBase*> >( |
7c673cae FG |
334 | mask, |
335 | pair<uint64_t, MDSInternalContextBase*>(seq, c))); | |
336 | // pdout(10,g_conf->debug_mds) << (mdsco_db_line_prefix(this)) | |
337 | // << "add_waiter " << hex << mask << dec << " " << c | |
338 | // << " on " << *this | |
339 | // << dendl; | |
340 | ||
341 | } | |
181888fb FG |
342 | virtual void take_waiting(uint64_t mask, std::list<MDSInternalContextBase*>& ls) { |
343 | if (!waiting || waiting->empty()) return; | |
7c673cae FG |
344 | |
345 | // process ordered waiters in the same order that they were added. | |
346 | std::map<uint64_t, MDSInternalContextBase*> ordered_waiters; | |
347 | ||
181888fb | 348 | for (auto it = waiting->begin(); it != waiting->end(); ) { |
7c673cae | 349 | if (it->first & mask) { |
181888fb FG |
350 | if (it->second.first > 0) { |
351 | ordered_waiters.insert(it->second); | |
352 | } else { | |
353 | ls.push_back(it->second.second); | |
354 | } | |
7c673cae FG |
355 | // pdout(10,g_conf->debug_mds) << (mdsco_db_line_prefix(this)) |
356 | // << "take_waiting mask " << hex << mask << dec << " took " << it->second | |
357 | // << " tag " << hex << it->first << dec | |
358 | // << " on " << *this | |
359 | // << dendl; | |
181888fb | 360 | waiting->erase(it++); |
7c673cae FG |
361 | } else { |
362 | // pdout(10,g_conf->debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second | |
363 | // << " tag " << hex << it->first << dec | |
364 | // << " on " << *this | |
365 | // << dendl; | |
181888fb | 366 | ++it; |
7c673cae FG |
367 | } |
368 | } | |
181888fb | 369 | for (auto it = ordered_waiters.begin(); it != ordered_waiters.end(); ++it) { |
7c673cae FG |
370 | ls.push_back(it->second); |
371 | } | |
181888fb | 372 | if (waiting->empty()) { |
7c673cae | 373 | put(PIN_WAITER); |
181888fb FG |
374 | waiting.release(); |
375 | } | |
7c673cae FG |
376 | } |
377 | void finish_waiting(uint64_t mask, int result = 0); | |
378 | ||
379 | // --------------------------------------------- | |
380 | // locking | |
381 | // noop unless overloaded. | |
382 | virtual SimpleLock* get_lock(int type) { ceph_abort(); return 0; } | |
383 | virtual void set_object_info(MDSCacheObjectInfo &info) { ceph_abort(); } | |
384 | virtual void encode_lock_state(int type, bufferlist& bl) { ceph_abort(); } | |
385 | virtual void decode_lock_state(int type, bufferlist& bl) { ceph_abort(); } | |
386 | virtual void finish_lock_waiters(int type, uint64_t mask, int r=0) { ceph_abort(); } | |
387 | virtual void add_lock_waiter(int type, uint64_t mask, MDSInternalContextBase *c) { ceph_abort(); } | |
388 | virtual bool is_lock_waiting(int type, uint64_t mask) { ceph_abort(); return false; } | |
389 | ||
390 | virtual void clear_dirty_scattered(int type) { ceph_abort(); } | |
391 | ||
392 | // --------------------------------------------- | |
393 | // ordering | |
394 | virtual bool is_lt(const MDSCacheObject *r) const = 0; | |
395 | struct ptr_lt { | |
396 | bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const { | |
397 | return l->is_lt(r); | |
398 | } | |
399 | }; | |
400 | ||
401 | }; | |
402 | ||
403 | inline std::ostream& operator<<(std::ostream& out, MDSCacheObject &o) { | |
404 | o.print(out); | |
405 | return out; | |
406 | } | |
407 | ||
31f18b77 | 408 | inline std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o) { |
7c673cae FG |
409 | o.object->print_db_line_prefix(out); |
410 | return out; | |
411 | } | |
412 | ||
413 | #endif |