]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | #ifndef CEPH_MDSCACHEOBJECT_H |
2 | #define CEPH_MDSCACHEOBJECT_H | |
3 | ||
7c673cae | 4 | #include <ostream> |
7c673cae FG |
5 | |
6 | #include "common/config.h" | |
181888fb FG |
7 | |
8 | #include "include/Context.h" | |
9 | #include "include/alloc_ptr.h" | |
7c673cae | 10 | #include "include/assert.h" |
181888fb | 11 | #include "include/mempool.h" |
7c673cae FG |
12 | #include "include/types.h" |
13 | #include "include/xlist.h" | |
181888fb | 14 | |
7c673cae FG |
15 | #include "mdstypes.h" |
16 | ||
17 | #define MDS_REF_SET // define me for improved debug output, sanity checking | |
18 | //#define MDS_AUTHPIN_SET // define me for debugging auth pin leaks | |
19 | //#define MDS_VERIFY_FRAGSTAT // do (slow) sanity checking on frags | |
20 | ||
21 | ||
22 | class MLock; | |
23 | class SimpleLock; | |
24 | class MDSCacheObject; | |
25 | class MDSInternalContextBase; | |
26 | ||
27 | /* | |
28 | * for metadata leases to clients | |
29 | */ | |
30 | struct ClientLease { | |
31 | client_t client; | |
32 | MDSCacheObject *parent; | |
33 | ||
94b18763 | 34 | ceph_seq_t seq = 0; |
7c673cae FG |
35 | utime_t ttl; |
36 | xlist<ClientLease*>::item item_session_lease; // per-session list | |
37 | xlist<ClientLease*>::item item_lease; // global list | |
38 | ||
39 | ClientLease(client_t c, MDSCacheObject *p) : | |
94b18763 | 40 | client(c), parent(p), |
7c673cae FG |
41 | item_session_lease(this), |
42 | item_lease(this) { } | |
94b18763 | 43 | ClientLease() = delete; |
7c673cae FG |
44 | }; |
45 | ||
46 | ||
47 | // print hack | |
48 | struct mdsco_db_line_prefix { | |
49 | MDSCacheObject *object; | |
50 | explicit mdsco_db_line_prefix(MDSCacheObject *o) : object(o) {} | |
51 | }; | |
31f18b77 | 52 | std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o); |
7c673cae FG |
53 | |
54 | // printer | |
31f18b77 | 55 | std::ostream& operator<<(std::ostream& out, const MDSCacheObject &o); |
7c673cae FG |
56 | |
57 | class MDSCacheObject { | |
58 | public: | |
59 | // -- pins -- | |
60 | const static int PIN_REPLICATED = 1000; | |
61 | const static int PIN_DIRTY = 1001; | |
62 | const static int PIN_LOCK = -1002; | |
63 | const static int PIN_REQUEST = -1003; | |
64 | const static int PIN_WAITER = 1004; | |
65 | const static int PIN_DIRTYSCATTERED = -1005; | |
66 | static const int PIN_AUTHPIN = 1006; | |
67 | static const int PIN_PTRWAITER = -1007; | |
68 | const static int PIN_TEMPEXPORTING = 1008; // temp pin between encode_ and finish_export | |
69 | static const int PIN_CLIENTLEASE = 1009; | |
70 | static const int PIN_DISCOVERBASE = 1010; | |
71 | ||
72 | const char *generic_pin_name(int p) const { | |
73 | switch (p) { | |
74 | case PIN_REPLICATED: return "replicated"; | |
75 | case PIN_DIRTY: return "dirty"; | |
76 | case PIN_LOCK: return "lock"; | |
77 | case PIN_REQUEST: return "request"; | |
78 | case PIN_WAITER: return "waiter"; | |
79 | case PIN_DIRTYSCATTERED: return "dirtyscattered"; | |
80 | case PIN_AUTHPIN: return "authpin"; | |
81 | case PIN_PTRWAITER: return "ptrwaiter"; | |
82 | case PIN_TEMPEXPORTING: return "tempexporting"; | |
83 | case PIN_CLIENTLEASE: return "clientlease"; | |
84 | case PIN_DISCOVERBASE: return "discoverbase"; | |
85 | default: ceph_abort(); return 0; | |
86 | } | |
87 | } | |
88 | ||
89 | // -- state -- | |
90 | const static int STATE_AUTH = (1<<30); | |
91 | const static int STATE_DIRTY = (1<<29); | |
92 | const static int STATE_NOTIFYREF = (1<<28); // notify dropping ref drop through _put() | |
93 | const static int STATE_REJOINING = (1<<27); // replica has not joined w/ primary copy | |
94 | const static int STATE_REJOINUNDEF = (1<<26); // contents undefined. | |
95 | ||
96 | ||
97 | // -- wait -- | |
98 | const static uint64_t WAIT_ORDERED = (1ull<<61); | |
99 | const static uint64_t WAIT_SINGLEAUTH = (1ull<<60); | |
100 | const static uint64_t WAIT_UNFREEZE = (1ull<<59); // pka AUTHPINNABLE | |
101 | ||
102 | ||
103 | // ============================================ | |
104 | // cons | |
105 | public: | |
94b18763 | 106 | MDSCacheObject() {} |
7c673cae FG |
107 | virtual ~MDSCacheObject() {} |
108 | ||
109 | // printing | |
110 | virtual void print(std::ostream& out) = 0; | |
111 | virtual std::ostream& print_db_line_prefix(std::ostream& out) { | |
112 | return out << "mdscacheobject(" << this << ") "; | |
113 | } | |
114 | ||
115 | // -------------------------------------------- | |
116 | // state | |
117 | protected: | |
94b18763 | 118 | __u32 state = 0; // state bits |
7c673cae FG |
119 | |
120 | public: | |
121 | unsigned get_state() const { return state; } | |
122 | unsigned state_test(unsigned mask) const { return (state & mask); } | |
123 | void state_clear(unsigned mask) { state &= ~mask; } | |
124 | void state_set(unsigned mask) { state |= mask; } | |
125 | void state_reset(unsigned s) { state = s; } | |
126 | ||
127 | bool is_auth() const { return state_test(STATE_AUTH); } | |
128 | bool is_dirty() const { return state_test(STATE_DIRTY); } | |
129 | bool is_clean() const { return !is_dirty(); } | |
130 | bool is_rejoining() const { return state_test(STATE_REJOINING); } | |
131 | ||
132 | // -------------------------------------------- | |
133 | // authority | |
134 | virtual mds_authority_t authority() const = 0; | |
135 | bool is_ambiguous_auth() const { | |
136 | return authority().second != CDIR_AUTH_UNKNOWN; | |
137 | } | |
138 | ||
139 | // -------------------------------------------- | |
140 | // pins | |
141 | protected: | |
94b18763 | 142 | __s32 ref = 0; // reference count |
7c673cae | 143 | #ifdef MDS_REF_SET |
181888fb | 144 | mempool::mds_co::map<int,int> ref_map; |
7c673cae FG |
145 | #endif |
146 | ||
147 | public: | |
148 | int get_num_ref(int by = -1) const { | |
149 | #ifdef MDS_REF_SET | |
150 | if (by >= 0) { | |
151 | if (ref_map.find(by) == ref_map.end()) { | |
152 | return 0; | |
153 | } else { | |
154 | return ref_map.find(by)->second; | |
155 | } | |
156 | } | |
157 | #endif | |
158 | return ref; | |
159 | } | |
160 | virtual const char *pin_name(int by) const = 0; | |
161 | //bool is_pinned_by(int by) { return ref_set.count(by); } | |
162 | //multiset<int>& get_ref_set() { return ref_set; } | |
163 | ||
164 | virtual void last_put() {} | |
165 | virtual void bad_put(int by) { | |
166 | #ifdef MDS_REF_SET | |
167 | assert(ref_map[by] > 0); | |
168 | #endif | |
169 | assert(ref > 0); | |
170 | } | |
171 | virtual void _put() {} | |
172 | void put(int by) { | |
173 | #ifdef MDS_REF_SET | |
174 | if (ref == 0 || ref_map[by] == 0) { | |
175 | #else | |
176 | if (ref == 0) { | |
177 | #endif | |
178 | bad_put(by); | |
179 | } else { | |
180 | ref--; | |
181 | #ifdef MDS_REF_SET | |
182 | ref_map[by]--; | |
183 | #endif | |
184 | if (ref == 0) | |
185 | last_put(); | |
186 | if (state_test(STATE_NOTIFYREF)) | |
187 | _put(); | |
188 | } | |
189 | } | |
190 | ||
191 | virtual void first_get() {} | |
192 | virtual void bad_get(int by) { | |
193 | #ifdef MDS_REF_SET | |
194 | assert(by < 0 || ref_map[by] == 0); | |
195 | #endif | |
196 | ceph_abort(); | |
197 | } | |
198 | void get(int by) { | |
199 | if (ref == 0) | |
200 | first_get(); | |
201 | ref++; | |
202 | #ifdef MDS_REF_SET | |
203 | if (ref_map.find(by) == ref_map.end()) | |
204 | ref_map[by] = 0; | |
205 | ref_map[by]++; | |
206 | #endif | |
207 | } | |
208 | ||
209 | void print_pin_set(std::ostream& out) const { | |
210 | #ifdef MDS_REF_SET | |
211 | std::map<int, int>::const_iterator it = ref_map.begin(); | |
212 | while (it != ref_map.end()) { | |
213 | out << " " << pin_name(it->first) << "=" << it->second; | |
214 | ++it; | |
215 | } | |
216 | #else | |
217 | out << " nref=" << ref; | |
218 | #endif | |
219 | } | |
220 | ||
221 | protected: | |
94b18763 FG |
222 | int auth_pins = 0; |
223 | int nested_auth_pins = 0; | |
7c673cae | 224 | #ifdef MDS_AUTHPIN_SET |
181888fb | 225 | mempool::mds_co::multiset<void*> auth_pin_set; |
7c673cae FG |
226 | #endif |
227 | ||
228 | public: | |
229 | bool is_auth_pinned() const { return auth_pins || nested_auth_pins; } | |
230 | int get_num_auth_pins() const { return auth_pins; } | |
231 | int get_num_nested_auth_pins() const { return nested_auth_pins; } | |
232 | ||
233 | void dump_states(Formatter *f) const; | |
234 | void dump(Formatter *f) const; | |
235 | ||
236 | // -------------------------------------------- | |
237 | // auth pins | |
238 | virtual bool can_auth_pin() const = 0; | |
239 | virtual void auth_pin(void *who) = 0; | |
240 | virtual void auth_unpin(void *who) = 0; | |
241 | virtual bool is_frozen() const = 0; | |
242 | virtual bool is_freezing() const = 0; | |
243 | virtual bool is_freezing_or_frozen() const { | |
244 | return is_frozen() || is_freezing(); | |
245 | } | |
246 | ||
247 | ||
248 | // -------------------------------------------- | |
249 | // replication (across mds cluster) | |
250 | protected: | |
94b18763 FG |
251 | unsigned replica_nonce = 0; // [replica] defined on replica |
252 | typedef mempool::mds_co::compact_map<mds_rank_t,unsigned> replica_map_type; | |
181888fb | 253 | replica_map_type replica_map; // [auth] mds -> nonce |
7c673cae FG |
254 | |
255 | public: | |
181888fb FG |
256 | bool is_replicated() const { return !get_replicas().empty(); } |
257 | bool is_replica(mds_rank_t mds) const { return get_replicas().count(mds); } | |
258 | int num_replicas() const { return get_replicas().size(); } | |
7c673cae | 259 | unsigned add_replica(mds_rank_t mds) { |
181888fb FG |
260 | if (get_replicas().count(mds)) |
261 | return ++get_replicas()[mds]; // inc nonce | |
262 | if (get_replicas().empty()) | |
7c673cae | 263 | get(PIN_REPLICATED); |
181888fb | 264 | return get_replicas()[mds] = 1; |
7c673cae FG |
265 | } |
266 | void add_replica(mds_rank_t mds, unsigned nonce) { | |
181888fb | 267 | if (get_replicas().empty()) |
7c673cae | 268 | get(PIN_REPLICATED); |
181888fb | 269 | get_replicas()[mds] = nonce; |
7c673cae FG |
270 | } |
271 | unsigned get_replica_nonce(mds_rank_t mds) { | |
181888fb FG |
272 | assert(get_replicas().count(mds)); |
273 | return get_replicas()[mds]; | |
7c673cae FG |
274 | } |
275 | void remove_replica(mds_rank_t mds) { | |
181888fb FG |
276 | assert(get_replicas().count(mds)); |
277 | get_replicas().erase(mds); | |
278 | if (get_replicas().empty()) { | |
7c673cae | 279 | put(PIN_REPLICATED); |
181888fb | 280 | } |
7c673cae FG |
281 | } |
282 | void clear_replica_map() { | |
181888fb | 283 | if (!get_replicas().empty()) |
7c673cae FG |
284 | put(PIN_REPLICATED); |
285 | replica_map.clear(); | |
286 | } | |
181888fb FG |
287 | replica_map_type& get_replicas() { return replica_map; } |
288 | const replica_map_type& get_replicas() const { return replica_map; } | |
7c673cae | 289 | void list_replicas(std::set<mds_rank_t>& ls) const { |
181888fb FG |
290 | for (const auto &p : get_replicas()) { |
291 | ls.insert(p.first); | |
292 | } | |
7c673cae FG |
293 | } |
294 | ||
295 | unsigned get_replica_nonce() const { return replica_nonce; } | |
296 | void set_replica_nonce(unsigned n) { replica_nonce = n; } | |
297 | ||
298 | ||
299 | // --------------------------------------------- | |
300 | // waiting | |
181888fb FG |
301 | private: |
302 | alloc_ptr<mempool::mds_co::multimap<uint64_t, std::pair<uint64_t, MDSInternalContextBase*>>> waiting; | |
7c673cae FG |
303 | static uint64_t last_wait_seq; |
304 | ||
305 | public: | |
306 | bool is_waiter_for(uint64_t mask, uint64_t min=0) { | |
307 | if (!min) { | |
308 | min = mask; | |
309 | while (min & (min-1)) // if more than one bit is set | |
181888fb | 310 | min &= min-1; // clear LSB |
7c673cae | 311 | } |
181888fb FG |
312 | if (waiting) { |
313 | for (auto p = waiting->lower_bound(min); p != waiting->end(); ++p) { | |
314 | if (p->first & mask) return true; | |
315 | if (p->first > mask) return false; | |
316 | } | |
7c673cae FG |
317 | } |
318 | return false; | |
319 | } | |
320 | virtual void add_waiter(uint64_t mask, MDSInternalContextBase *c) { | |
181888fb | 321 | if (waiting->empty()) |
7c673cae FG |
322 | get(PIN_WAITER); |
323 | ||
324 | uint64_t seq = 0; | |
325 | if (mask & WAIT_ORDERED) { | |
326 | seq = ++last_wait_seq; | |
327 | mask &= ~WAIT_ORDERED; | |
328 | } | |
181888fb | 329 | waiting->insert(pair<uint64_t, pair<uint64_t, MDSInternalContextBase*> >( |
7c673cae FG |
330 | mask, |
331 | pair<uint64_t, MDSInternalContextBase*>(seq, c))); | |
332 | // pdout(10,g_conf->debug_mds) << (mdsco_db_line_prefix(this)) | |
333 | // << "add_waiter " << hex << mask << dec << " " << c | |
334 | // << " on " << *this | |
335 | // << dendl; | |
336 | ||
337 | } | |
181888fb FG |
338 | virtual void take_waiting(uint64_t mask, std::list<MDSInternalContextBase*>& ls) { |
339 | if (!waiting || waiting->empty()) return; | |
7c673cae FG |
340 | |
341 | // process ordered waiters in the same order that they were added. | |
342 | std::map<uint64_t, MDSInternalContextBase*> ordered_waiters; | |
343 | ||
181888fb | 344 | for (auto it = waiting->begin(); it != waiting->end(); ) { |
7c673cae | 345 | if (it->first & mask) { |
181888fb FG |
346 | if (it->second.first > 0) { |
347 | ordered_waiters.insert(it->second); | |
348 | } else { | |
349 | ls.push_back(it->second.second); | |
350 | } | |
7c673cae FG |
351 | // pdout(10,g_conf->debug_mds) << (mdsco_db_line_prefix(this)) |
352 | // << "take_waiting mask " << hex << mask << dec << " took " << it->second | |
353 | // << " tag " << hex << it->first << dec | |
354 | // << " on " << *this | |
355 | // << dendl; | |
181888fb | 356 | waiting->erase(it++); |
7c673cae FG |
357 | } else { |
358 | // pdout(10,g_conf->debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second | |
359 | // << " tag " << hex << it->first << dec | |
360 | // << " on " << *this | |
361 | // << dendl; | |
181888fb | 362 | ++it; |
7c673cae FG |
363 | } |
364 | } | |
181888fb | 365 | for (auto it = ordered_waiters.begin(); it != ordered_waiters.end(); ++it) { |
7c673cae FG |
366 | ls.push_back(it->second); |
367 | } | |
181888fb | 368 | if (waiting->empty()) { |
7c673cae | 369 | put(PIN_WAITER); |
28e407b8 | 370 | waiting.reset(); |
181888fb | 371 | } |
7c673cae FG |
372 | } |
373 | void finish_waiting(uint64_t mask, int result = 0); | |
374 | ||
375 | // --------------------------------------------- | |
376 | // locking | |
377 | // noop unless overloaded. | |
378 | virtual SimpleLock* get_lock(int type) { ceph_abort(); return 0; } | |
379 | virtual void set_object_info(MDSCacheObjectInfo &info) { ceph_abort(); } | |
380 | virtual void encode_lock_state(int type, bufferlist& bl) { ceph_abort(); } | |
381 | virtual void decode_lock_state(int type, bufferlist& bl) { ceph_abort(); } | |
382 | virtual void finish_lock_waiters(int type, uint64_t mask, int r=0) { ceph_abort(); } | |
383 | virtual void add_lock_waiter(int type, uint64_t mask, MDSInternalContextBase *c) { ceph_abort(); } | |
384 | virtual bool is_lock_waiting(int type, uint64_t mask) { ceph_abort(); return false; } | |
385 | ||
386 | virtual void clear_dirty_scattered(int type) { ceph_abort(); } | |
387 | ||
388 | // --------------------------------------------- | |
389 | // ordering | |
390 | virtual bool is_lt(const MDSCacheObject *r) const = 0; | |
391 | struct ptr_lt { | |
392 | bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const { | |
393 | return l->is_lt(r); | |
394 | } | |
395 | }; | |
396 | ||
397 | }; | |
398 | ||
399 | inline std::ostream& operator<<(std::ostream& out, MDSCacheObject &o) { | |
400 | o.print(out); | |
401 | return out; | |
402 | } | |
403 | ||
31f18b77 | 404 | inline std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o) { |
7c673cae FG |
405 | o.object->print_db_line_prefix(out); |
406 | return out; | |
407 | } | |
408 | ||
409 | #endif |