]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "include/int_types.h" | |
16 | #include "common/errno.h" | |
17 | ||
18 | #include <string> | |
19 | #include <stdio.h> | |
20 | ||
21 | #include "CInode.h" | |
22 | #include "CDir.h" | |
23 | #include "CDentry.h" | |
24 | ||
25 | #include "MDSRank.h" | |
26 | #include "MDCache.h" | |
27 | #include "MDLog.h" | |
28 | #include "Locker.h" | |
29 | #include "Mutation.h" | |
30 | ||
31 | #include "events/EUpdate.h" | |
32 | ||
33 | #include "osdc/Objecter.h" | |
34 | ||
35 | #include "snap.h" | |
36 | ||
37 | #include "LogSegment.h" | |
38 | ||
39 | #include "common/Clock.h" | |
40 | ||
41 | #include "messages/MLock.h" | |
42 | #include "messages/MClientCaps.h" | |
43 | ||
44 | #include "common/config.h" | |
45 | #include "global/global_context.h" | |
46 | #include "include/assert.h" | |
47 | ||
48 | #include "mds/MDSContinuation.h" | |
49 | #include "mds/InoTable.h" | |
50 | ||
51 | #define dout_context g_ceph_context | |
52 | #define dout_subsys ceph_subsys_mds | |
53 | #undef dout_prefix | |
54 | #define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") " | |
55 | ||
56 | ||
// Base class for I/O completion contexts scoped to a CInode: holds the
// inode pointer and resolves the owning MDSRank through its MDCache.
class CInodeIOContext : public MDSIOContextBase
{
protected:
  CInode *in;          // inode this I/O pertains to; never NULL (asserted)
  MDSRank *get_mds() override {return in->mdcache->mds;}
public:
  explicit CInodeIOContext(CInode *in_) : in(in_) {
    assert(in != NULL);
  }
};
67 | ||
68 | ||
// Static LockType descriptors shared by all CInode instances; each one is
// keyed by the CEPH_LOCK_* id of the per-inode lock it describes.
LockType CInode::versionlock_type(CEPH_LOCK_IVERSION);
LockType CInode::authlock_type(CEPH_LOCK_IAUTH);
LockType CInode::linklock_type(CEPH_LOCK_ILINK);
LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT);
LockType CInode::filelock_type(CEPH_LOCK_IFILE);
LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR);
LockType CInode::snaplock_type(CEPH_LOCK_ISNAP);
LockType CInode::nestlock_type(CEPH_LOCK_INEST);
LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK);
LockType CInode::policylock_type(CEPH_LOCK_IPOLICY);
79 | ||
80 | //int cinode_pins[CINODE_NUM_PINS]; // counts | |
81 | ostream& CInode::print_db_line_prefix(ostream& out) | |
82 | { | |
83 | return out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") "; | |
84 | } | |
85 | ||
86 | /* | |
87 | * write caps and lock ids | |
88 | */ | |
// Table pairing each per-inode lock with the client capability bits that
// represent writes guarded by that lock (see comment above).
struct cinode_lock_info_t cinode_lock_info[] = {
  { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR },
  { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL },
  { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL },
  { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL },
};
// Number of entries in cinode_lock_info[].
int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]);
96 | ||
97 | ||
98 | ||
// Debug formatter for a CInode.  Dumps, in order: ino and snap range, path,
// auth/replica info, version, pins, state flags, stats, locks, caps, refs.
// The exact field order is relied on by log readers; do not reorder.
ostream& operator<<(ostream& out, const CInode& in)
{
  string path;
  in.make_path_string(path, true);

  out << "[inode " << in.inode.ino;
  out << " ["
      << (in.is_multiversion() ? "...":"")
      << in.first << "," << in.last << "]";
  out << " " << path << (in.is_dir() ? "/":"");

  // authority: either we are auth (list replicas) or note whose replica we are
  if (in.is_auth()) {
    out << " auth";
    if (in.is_replicated())
      out << in.get_replicas();
  } else {
    mds_authority_t a = in.authority();
    out << " rep@" << a.first;
    if (a.second != CDIR_AUTH_UNKNOWN)
      out << "," << a.second;
    out << "." << in.get_replica_nonce();
  }

  if (in.is_symlink())
    out << " symlink='" << in.symlink << "'";
  if (in.is_dir() && !in.dirfragtree.empty())
    out << " " << in.dirfragtree;

  // committed version, plus projected version if a change is in flight
  out << " v" << in.get_version();
  if (in.get_projected_version() > in.get_version())
    out << " pv" << in.get_projected_version();

  if (in.is_auth_pinned()) {
    out << " ap=" << in.get_num_auth_pins() << "+" << in.get_num_nested_auth_pins();
#ifdef MDS_AUTHPIN_SET
    out << "(" << in.auth_pin_set << ")";
#endif
  }

  if (in.snaprealm)
    out << " snaprealm=" << in.snaprealm;

  // state flags
  if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH";
  if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " needsrecover";
  if (in.state_test(CInode::STATE_RECOVERING)) out << " recovering";
  if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " dirtyparent";
  if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " missingobjs";
  if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
  if (in.is_frozen_inode()) out << " FROZEN";
  if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN";

  const inode_t *pi = in.get_projected_inode();
  if (pi->is_truncating())
    out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")";

  // size/stat summary: dirstat for dirs, size/nlink for files
  if (in.inode.is_dir()) {
    out << " " << in.inode.dirstat;
    if (g_conf->mds_debug_scatterstat && in.is_projected()) {
      const inode_t *pi = in.get_projected_inode();
      out << "->" << pi->dirstat;
    }
  } else {
    out << " s=" << in.inode.size;
    if (in.inode.nlink != 1)
      out << " nl=" << in.inode.nlink;
  }

  // rstat (and accounted_rstat when they diverge)
  out << " " << in.inode.rstat;
  if (!(in.inode.rstat == in.inode.accounted_rstat))
    out << "/" << in.inode.accounted_rstat;
  if (g_conf->mds_debug_scatterstat && in.is_projected()) {
    const inode_t *pi = in.get_projected_inode();
    out << "->" << pi->rstat;
    if (!(pi->rstat == pi->accounted_rstat))
      out << "/" << pi->accounted_rstat;
  }

  if (!in.client_need_snapflush.empty())
    out << " need_snapflush=" << in.client_need_snapflush;


  // locks (only those not in the default sync+unlocked state)
  if (!in.authlock.is_sync_and_unlocked())
    out << " " << in.authlock;
  if (!in.linklock.is_sync_and_unlocked())
    out << " " << in.linklock;
  if (in.inode.is_dir()) {
    if (!in.dirfragtreelock.is_sync_and_unlocked())
      out << " " << in.dirfragtreelock;
    if (!in.snaplock.is_sync_and_unlocked())
      out << " " << in.snaplock;
    if (!in.nestlock.is_sync_and_unlocked())
      out << " " << in.nestlock;
    if (!in.policylock.is_sync_and_unlocked())
      out << " " << in.policylock;
  } else {
    if (!in.flocklock.is_sync_and_unlocked())
      out << " " << in.flocklock;
  }
  if (!in.filelock.is_sync_and_unlocked())
    out << " " << in.filelock;
  if (!in.xattrlock.is_sync_and_unlocked())
    out << " " << in.xattrlock;
  if (!in.versionlock.is_sync_and_unlocked())
    out << " " << in.versionlock;

  // hack: spit out crap on which clients have caps
  if (in.inode.client_ranges.size())
    out << " cr=" << in.inode.client_ranges;

  if (!in.get_client_caps().empty()) {
    out << " caps={";
    for (map<client_t,Capability*>::const_iterator it = in.get_client_caps().begin();
         it != in.get_client_caps().end();
         ++it) {
      if (it != in.get_client_caps().begin()) out << ",";
      // pending[/issued]/wanted@last_sent
      out << it->first << "="
          << ccap_string(it->second->pending());
      if (it->second->issued() != it->second->pending())
        out << "/" << ccap_string(it->second->issued());
      out << "/" << ccap_string(it->second->wanted())
          << "@" << it->second->get_last_sent();
    }
    out << "}";
    if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) {
      out << ",l=" << in.get_loner();
      if (in.get_loner() != in.get_wanted_loner())
        out << "(" << in.get_wanted_loner() << ")";
    }
  }
  if (!in.get_mds_caps_wanted().empty()) {
    out << " mcw={";
    for (compact_map<int,int>::const_iterator p = in.get_mds_caps_wanted().begin();
         p != in.get_mds_caps_wanted().end();
         ++p) {
      if (p != in.get_mds_caps_wanted().begin())
        out << ',';
      out << p->first << '=' << ccap_string(p->second);
    }
    out << '}';
  }

  if (in.get_num_ref()) {
    out << " |";
    in.print_pin_set(out);
  }

  if (in.inode.export_pin != MDS_RANK_NONE) {
    out << " export_pin=" << in.inode.export_pin;
  }

  // memory address, for correlating log lines about the same object
  out << " " << &in;
  out << "]";
  return out;
}
255 | ||
256 | ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si) | |
257 | { | |
258 | out << "{scrub_start_version: " << si.scrub_start_version | |
259 | << ", scrub_start_stamp: " << si.scrub_start_stamp | |
260 | << ", last_scrub_version: " << si.last_scrub_version | |
261 | << ", last_scrub_stamp: " << si.last_scrub_stamp; | |
262 | return out; | |
263 | } | |
264 | ||
265 | ||
266 | ||
// Dump this inode using the full operator<< formatter above.
void CInode::print(ostream& out)
{
  out << *this;
}
271 | ||
272 | ||
273 | ||
// Record that `client` still owes a snap flush for `snapid`, whose snapped
// state lives on the old inode `snapin`.  Pins the head inode (first entry
// overall) and the snapped inode (first client for this snapid); the pins
// are dropped again by remove_need_snapflush().
void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
{
  dout(10) << "add_need_snapflush client." << client << " snapid " << snapid << " on " << snapin << dendl;

  if (client_need_snapflush.empty()) {
    get(CInode::PIN_NEEDSNAPFLUSH);

    // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
    // long periods waiting for clients to flush their snaps.
    auth_pin(this);   // pin head inode...
  }

  set<client_t>& clients = client_need_snapflush[snapid];
  if (clients.empty())
    snapin->auth_pin(this);   // ...and pin snapped/old inode!

  clients.insert(client);
}
292 | ||
// Undo add_need_snapflush() for one (snapid, client) pair.  Unpins the
// snapped inode when its client set empties, and the head inode when no
// pending snapflushes remain at all.  Unknown snapid/client is a no-op.
void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
{
  dout(10) << "remove_need_snapflush client." << client << " snapid " << snapid << " on " << snapin << dendl;
  compact_map<snapid_t, std::set<client_t> >::iterator p = client_need_snapflush.find(snapid);
  if (p == client_need_snapflush.end()) {
    dout(10) << " snapid not found" << dendl;
    return;
  }
  if (!p->second.count(client)) {
    dout(10) << " client not found" << dendl;
    return;
  }
  p->second.erase(client);
  if (p->second.empty()) {
    client_need_snapflush.erase(p);
    snapin->auth_unpin(this);   // last client for this snapid

    if (client_need_snapflush.empty()) {
      put(CInode::PIN_NEEDSNAPFLUSH);
      auth_unpin(this);         // no pending snapflushes left at all
    }
  }
}
316 | ||
// After `in` was cloned (COWed) into `cowin` covering [cowin->first,
// cowin->last], transfer the auth pins for pending snapflushes in that snap
// range from `in` to `cowin`.  Returns true if cowin ends up with pending
// snapflushes (and therefore holds new pins).
bool CInode::split_need_snapflush(CInode *cowin, CInode *in)
{
  dout(10) << "split_need_snapflush [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl;
  bool need_flush = false;
  for (compact_map<snapid_t, set<client_t> >::iterator p = client_need_snapflush.lower_bound(cowin->first);
       p != client_need_snapflush.end() && p->first < in->first; ) {
    compact_map<snapid_t, set<client_t> >::iterator q = p;
    ++p;   // advance before possible erase of q
    assert(!q->second.empty());
    if (cowin->last >= q->first) {
      // snapid now falls in cowin's range: pin cowin...
      cowin->auth_pin(this);
      need_flush = true;
    } else
      client_need_snapflush.erase(q);   // covered by neither; drop entry
    in->auth_unpin(this);   // ...and release the pin formerly held via in
  }
  return need_flush;
}
335 | ||
// Flag this inode's rstat as dirty: pin it, queue it on the parent dir's
// dirty_rstat_inodes list, and kick the parent's nestlock scatterlock so
// the change propagates.  Idempotent if already dirty.
void CInode::mark_dirty_rstat()
{
  if (!state_test(STATE_DIRTYRSTAT)) {
    dout(10) << "mark_dirty_rstat" << dendl;
    state_set(STATE_DIRTYRSTAT);
    get(PIN_DIRTYRSTAT);
    CDentry *pdn = get_projected_parent_dn();
    if (pdn->is_auth()) {
      CDir *pdir = pdn->dir;
      pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item);
      mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock);
    } else {
      // under cross-MDS rename.
      // DIRTYRSTAT flag will get cleared when rename finishes
      assert(state_test(STATE_AMBIGUOUSAUTH));
    }
  }
}
354 | void CInode::clear_dirty_rstat() | |
355 | { | |
356 | if (state_test(STATE_DIRTYRSTAT)) { | |
357 | dout(10) << "clear_dirty_rstat" << dendl; | |
358 | state_clear(STATE_DIRTYRSTAT); | |
359 | put(PIN_DIRTYRSTAT); | |
360 | dirty_rstat_item.remove_myself(); | |
361 | } | |
362 | } | |
363 | ||
// Push a new projected (not-yet-journaled) copy of the inode onto the
// projection stack and return it for mutation.  If px is non-NULL, the
// current projected xattr map is copied into *px and the projection takes
// ownership of px (it is deleted in pop_and_dirty_projected_inode()).
inode_t *CInode::project_inode(map<string,bufferptr> *px)
{
  if (projected_nodes.empty()) {
    // first projection copies the stable inode
    projected_nodes.push_back(new projected_inode_t(new inode_t(inode)));
    if (px)
      *px = xattrs;
  } else {
    // subsequent projections stack on the newest projected inode
    projected_nodes.push_back(new projected_inode_t(
        new inode_t(*projected_nodes.back()->inode)));
    if (px)
      *px = *get_projected_xattrs();
  }

  projected_inode_t &pi = *projected_nodes.back();

  if (px) {
    pi.xattrs = px;   // projection now owns px
    ++num_projected_xattrs;
  }

  // fold a pending scrub result into this projection before it is journaled
  if (scrub_infop && scrub_infop->last_scrub_dirty) {
    pi.inode->last_scrub_stamp = scrub_infop->last_scrub_stamp;
    pi.inode->last_scrub_version = scrub_infop->last_scrub_version;
    scrub_infop->last_scrub_dirty = false;
    scrub_maybe_delete_info();
  }
  dout(15) << "project_inode " << pi.inode << dendl;
  return pi.inode;
}
393 | ||
394 | void CInode::pop_and_dirty_projected_inode(LogSegment *ls) | |
395 | { | |
396 | assert(!projected_nodes.empty()); | |
397 | dout(15) << "pop_and_dirty_projected_inode " << projected_nodes.front()->inode | |
398 | << " v" << projected_nodes.front()->inode->version << dendl; | |
399 | int64_t old_pool = inode.layout.pool_id; | |
400 | ||
401 | mark_dirty(projected_nodes.front()->inode->version, ls); | |
402 | inode = *projected_nodes.front()->inode; | |
403 | ||
404 | if (inode.is_backtrace_updated()) | |
405 | _mark_dirty_parent(ls, old_pool != inode.layout.pool_id); | |
406 | ||
407 | map<string,bufferptr> *px = projected_nodes.front()->xattrs; | |
408 | if (px) { | |
409 | --num_projected_xattrs; | |
410 | xattrs = *px; | |
411 | delete px; | |
412 | } | |
413 | ||
414 | if (projected_nodes.front()->snapnode) { | |
415 | pop_projected_snaprealm(projected_nodes.front()->snapnode); | |
416 | --num_projected_srnodes; | |
417 | } | |
418 | ||
419 | delete projected_nodes.front()->inode; | |
420 | delete projected_nodes.front(); | |
421 | ||
422 | projected_nodes.pop_front(); | |
423 | } | |
424 | ||
425 | sr_t *CInode::project_snaprealm(snapid_t snapid) | |
426 | { | |
427 | sr_t *cur_srnode = get_projected_srnode(); | |
428 | sr_t *new_srnode; | |
429 | ||
430 | if (cur_srnode) { | |
431 | new_srnode = new sr_t(*cur_srnode); | |
432 | } else { | |
433 | new_srnode = new sr_t(); | |
434 | new_srnode->created = snapid; | |
435 | new_srnode->current_parent_since = get_oldest_snap(); | |
436 | } | |
437 | dout(10) << "project_snaprealm " << new_srnode << dendl; | |
438 | projected_nodes.back()->snapnode = new_srnode; | |
439 | ++num_projected_srnodes; | |
440 | return new_srnode; | |
441 | } | |
442 | ||
443 | /* if newparent != parent, add parent to past_parents | |
444 | if parent DNE, we need to find what the parent actually is and fill that in */ | |
// Project a snaprealm change that moves us under `newparent`.  If the
// parent actually changes, the old parent is remembered in past_parents
// for the snapids it covered, and current_parent_since is advanced past
// both parents' newest snaps.  (See comment above: if no realm is open
// yet, the effective old parent is the nearest ancestor realm.)
void CInode::project_past_snaprealm_parent(SnapRealm *newparent)
{
  sr_t *new_snap = project_snaprealm();
  SnapRealm *oldparent;
  if (!snaprealm) {
    // no realm open on this inode yet; inherit from the containing realm
    oldparent = find_snaprealm();
    new_snap->seq = oldparent->get_newest_seq();
  }
  else
    oldparent = snaprealm->parent;

  if (newparent != oldparent) {
    snapid_t oldparentseq = oldparent->get_newest_seq();
    if (oldparentseq + 1 > new_snap->current_parent_since) {
      // record the old parent as covering [current_parent_since, oldparentseq]
      new_snap->past_parents[oldparentseq].ino = oldparent->inode->ino();
      new_snap->past_parents[oldparentseq].first = new_snap->current_parent_since;
    }
    new_snap->current_parent_since = MAX(oldparentseq, newparent->get_last_created()) + 1;
  }
}
465 | ||
// Apply a projected srnode (from project_snaprealm) to the live snaprealm,
// opening the realm if needed.  Takes ownership of next_snaprealm and
// frees it.  If the set of past parents changed, cached snap info is
// invalidated and parent realms are re-opened.
void CInode::pop_projected_snaprealm(sr_t *next_snaprealm)
{
  assert(next_snaprealm);
  dout(10) << "pop_projected_snaprealm " << next_snaprealm
	   << " seq" << next_snaprealm->seq << dendl;
  bool invalidate_cached_snaps = false;
  if (!snaprealm) {
    open_snaprealm();
  } else if (next_snaprealm->past_parents.size() !=
	     snaprealm->srnode.past_parents.size()) {
    invalidate_cached_snaps = true;
    // re-open past parents
    snaprealm->_close_parents();

    dout(10) << " realm " << *snaprealm << " past_parents " << snaprealm->srnode.past_parents
	     << " -> " << next_snaprealm->past_parents << dendl;
  }
  snaprealm->srnode = *next_snaprealm;
  delete next_snaprealm;

  // we should be able to open these up (or have them already be open).
  bool ok = snaprealm->_open_parents(NULL);
  assert(ok);

  if (invalidate_cached_snaps)
    snaprealm->invalidate_cached_snaps();

  if (snaprealm->parent)
    dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl;
}
496 | ||
497 | ||
498 | // ====== CInode ======= | |
499 | ||
500 | // dirfrags | |
501 | ||
502 | __u32 InodeStoreBase::hash_dentry_name(const string &dn) | |
503 | { | |
504 | int which = inode.dir_layout.dl_dir_hash; | |
505 | if (!which) | |
506 | which = CEPH_STR_HASH_LINUX; | |
507 | assert(ceph_str_hash_valid(which)); | |
508 | return ceph_str_hash(which, dn.data(), dn.length()); | |
509 | } | |
510 | ||
511 | frag_t InodeStoreBase::pick_dirfrag(const string& dn) | |
512 | { | |
513 | if (dirfragtree.empty()) | |
514 | return frag_t(); // avoid the string hash if we can. | |
515 | ||
516 | __u32 h = hash_dentry_name(dn); | |
517 | return dirfragtree[h]; | |
518 | } | |
519 | ||
// Collect the open dirfrags covering fragment fg into ls.  Returns true if
// the open dirfrags completely cover fg, false if some portion is missing.
// NOTE(review): the second phase builds a temporary fragtree forced to leaf
// at fg and each open frag to detect coverage when the live fragtree
// disagrees with the set of open dirfrags (e.g. mid-fragmentation).
bool CInode::get_dirfrags_under(frag_t fg, list<CDir*>& ls)
{
  bool all = true;
  list<frag_t> fglist;
  dirfragtree.get_leaves_under(fg, fglist);
  // fast path: every leaf of the fragtree under fg that is open
  for (list<frag_t>::iterator p = fglist.begin(); p != fglist.end(); ++p)
    if (dirfrags.count(*p))
      ls.push_back(dirfrags[*p]);
    else
      all = false;

  if (all)
    return all;

  fragtree_t tmpdft;
  tmpdft.force_to_leaf(g_ceph_context, fg);
  // pick up open dirfrags inside fg that are not leaves of the live tree
  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) {
    tmpdft.force_to_leaf(g_ceph_context, p->first);
    if (fg.contains(p->first) && !dirfragtree.is_leaf(p->first))
      ls.push_back(p->second);
  }

  // coverage is complete iff every leaf of the merged tree under fg is open
  all = true;
  tmpdft.get_leaves_under(fg, fglist);
  for (list<frag_t>::iterator p = fglist.begin(); p != fglist.end(); ++p)
    if (!dirfrags.count(*p)) {
      all = false;
      break;
    }

  return all;
}
552 | ||
553 | void CInode::verify_dirfrags() | |
554 | { | |
555 | bool bad = false; | |
556 | for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) { | |
557 | if (!dirfragtree.is_leaf(p->first)) { | |
558 | dout(0) << "have open dirfrag " << p->first << " but not leaf in " << dirfragtree | |
559 | << ": " << *p->second << dendl; | |
560 | bad = true; | |
561 | } | |
562 | } | |
563 | assert(!bad); | |
564 | } | |
565 | ||
566 | void CInode::force_dirfrags() | |
567 | { | |
568 | bool bad = false; | |
569 | for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); p != dirfrags.end(); ++p) { | |
570 | if (!dirfragtree.is_leaf(p->first)) { | |
571 | dout(0) << "have open dirfrag " << p->first << " but not leaf in " << dirfragtree | |
572 | << ": " << *p->second << dendl; | |
573 | bad = true; | |
574 | } | |
575 | } | |
576 | ||
577 | if (bad) { | |
578 | list<frag_t> leaves; | |
579 | dirfragtree.get_leaves(leaves); | |
580 | for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p) | |
581 | mdcache->get_force_dirfrag(dirfrag_t(ino(),*p), true); | |
582 | } | |
583 | ||
584 | verify_dirfrags(); | |
585 | } | |
586 | ||
587 | CDir *CInode::get_approx_dirfrag(frag_t fg) | |
588 | { | |
589 | CDir *dir = get_dirfrag(fg); | |
590 | if (dir) return dir; | |
591 | ||
592 | // find a child? | |
593 | list<CDir*> ls; | |
594 | get_dirfrags_under(fg, ls); | |
595 | if (!ls.empty()) | |
596 | return ls.front(); | |
597 | ||
598 | // try parents? | |
599 | while (fg.bits() > 0) { | |
600 | fg = fg.parent(); | |
601 | dir = get_dirfrag(fg); | |
602 | if (dir) return dir; | |
603 | } | |
604 | return NULL; | |
605 | } | |
606 | ||
607 | void CInode::get_dirfrags(list<CDir*>& ls) | |
608 | { | |
609 | // all dirfrags | |
610 | for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); | |
611 | p != dirfrags.end(); | |
612 | ++p) | |
613 | ls.push_back(p->second); | |
614 | } | |
615 | void CInode::get_nested_dirfrags(list<CDir*>& ls) | |
616 | { | |
617 | // dirfrags in same subtree | |
618 | for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); | |
619 | p != dirfrags.end(); | |
620 | ++p) | |
621 | if (!p->second->is_subtree_root()) | |
622 | ls.push_back(p->second); | |
623 | } | |
624 | void CInode::get_subtree_dirfrags(list<CDir*>& ls) | |
625 | { | |
626 | // dirfrags that are roots of new subtrees | |
627 | for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); | |
628 | p != dirfrags.end(); | |
629 | ++p) | |
630 | if (p->second->is_subtree_root()) | |
631 | ls.push_back(p->second); | |
632 | } | |
633 | ||
634 | ||
635 | CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg) | |
636 | { | |
637 | assert(is_dir()); | |
638 | ||
639 | // have it? | |
640 | CDir *dir = get_dirfrag(fg); | |
641 | if (!dir) { | |
642 | // create it. | |
643 | assert(is_auth() || mdcache->mds->is_any_replay()); | |
644 | dir = new CDir(this, fg, mdcache, is_auth()); | |
645 | add_dirfrag(dir); | |
646 | } | |
647 | return dir; | |
648 | } | |
649 | ||
650 | CDir *CInode::add_dirfrag(CDir *dir) | |
651 | { | |
652 | assert(dirfrags.count(dir->dirfrag().frag) == 0); | |
653 | dirfrags[dir->dirfrag().frag] = dir; | |
654 | ||
655 | if (stickydir_ref > 0) { | |
656 | dir->state_set(CDir::STATE_STICKY); | |
657 | dir->get(CDir::PIN_STICKY); | |
658 | } | |
659 | ||
660 | maybe_export_pin(); | |
661 | ||
662 | return dir; | |
663 | } | |
664 | ||
// Tear down one open dirfrag: drop null dentries, clear its dirty flag,
// release our sticky ref if any, and delete it.  The frag must end up with
// zero refs (asserted) — callers must have released everything else first.
void CInode::close_dirfrag(frag_t fg)
{
  dout(14) << "close_dirfrag " << fg << dendl;
  assert(dirfrags.count(fg));
  
  CDir *dir = dirfrags[fg];
  dir->remove_null_dentries();
  
  // clear dirty flag
  if (dir->is_dirty())
    dir->mark_clean();
  
  // give back the sticky ref we hold on every open dirfrag
  if (stickydir_ref > 0) {
    dir->state_clear(CDir::STATE_STICKY);
    dir->put(CDir::PIN_STICKY);
  }

  // dump any remaining dentries, for debugging purposes
  for (CDir::map_t::iterator p = dir->items.begin();
       p != dir->items.end();
       ++p) 
    dout(14) << "close_dirfrag LEFTOVER dn " << *p->second << dendl;

  assert(dir->get_num_ref() == 0);
  delete dir;
  dirfrags.erase(fg);
}
692 | ||
693 | void CInode::close_dirfrags() | |
694 | { | |
695 | while (!dirfrags.empty()) | |
696 | close_dirfrag(dirfrags.begin()->first); | |
697 | } | |
698 | ||
699 | bool CInode::has_subtree_root_dirfrag(int auth) | |
700 | { | |
701 | for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); | |
702 | p != dirfrags.end(); | |
703 | ++p) | |
704 | if (p->second->is_subtree_root() && | |
705 | (auth == -1 || p->second->dir_auth.first == auth)) | |
706 | return true; | |
707 | return false; | |
708 | } | |
709 | ||
710 | bool CInode::has_subtree_or_exporting_dirfrag() | |
711 | { | |
712 | for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); | |
713 | p != dirfrags.end(); | |
714 | ++p) | |
715 | if (p->second->is_subtree_root() || | |
716 | p->second->state_test(CDir::STATE_EXPORTING)) | |
717 | return true; | |
718 | return false; | |
719 | } | |
720 | ||
721 | void CInode::get_stickydirs() | |
722 | { | |
723 | if (stickydir_ref == 0) { | |
724 | get(PIN_STICKYDIRS); | |
725 | for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); | |
726 | p != dirfrags.end(); | |
727 | ++p) { | |
728 | p->second->state_set(CDir::STATE_STICKY); | |
729 | p->second->get(CDir::PIN_STICKY); | |
730 | } | |
731 | } | |
732 | stickydir_ref++; | |
733 | } | |
734 | ||
735 | void CInode::put_stickydirs() | |
736 | { | |
737 | assert(stickydir_ref > 0); | |
738 | stickydir_ref--; | |
739 | if (stickydir_ref == 0) { | |
740 | put(PIN_STICKYDIRS); | |
741 | for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); | |
742 | p != dirfrags.end(); | |
743 | ++p) { | |
744 | p->second->state_clear(CDir::STATE_STICKY); | |
745 | p->second->put(CDir::PIN_STICKY); | |
746 | } | |
747 | } | |
748 | } | |
749 | ||
750 | ||
751 | ||
752 | ||
753 | ||
754 | // pins | |
755 | ||
756 | void CInode::first_get() | |
757 | { | |
758 | // pin my dentry? | |
759 | if (parent) | |
760 | parent->get(CDentry::PIN_INODEPIN); | |
761 | } | |
762 | ||
763 | void CInode::last_put() | |
764 | { | |
765 | // unpin my dentry? | |
766 | if (parent) | |
767 | parent->put(CDentry::PIN_INODEPIN); | |
768 | } | |
769 | ||
// Ref-drop hook: when the only remaining refs are the dirty / dirty-parent
// pins, this inode may be an eligible stray — ask the cache to re-evaluate.
void CInode::_put()
{
  if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
    mdcache->maybe_eval_stray(this, true);
}
775 | ||
776 | void CInode::add_remote_parent(CDentry *p) | |
777 | { | |
778 | if (remote_parents.empty()) | |
779 | get(PIN_REMOTEPARENT); | |
780 | remote_parents.insert(p); | |
781 | } | |
782 | void CInode::remove_remote_parent(CDentry *p) | |
783 | { | |
784 | remote_parents.erase(p); | |
785 | if (remote_parents.empty()) | |
786 | put(PIN_REMOTEPARENT); | |
787 | } | |
788 | ||
789 | ||
790 | ||
791 | ||
792 | CDir *CInode::get_parent_dir() | |
793 | { | |
794 | if (parent) | |
795 | return parent->dir; | |
796 | return NULL; | |
797 | } | |
798 | CDir *CInode::get_projected_parent_dir() | |
799 | { | |
800 | CDentry *p = get_projected_parent_dn(); | |
801 | if (p) | |
802 | return p->dir; | |
803 | return NULL; | |
804 | } | |
805 | CInode *CInode::get_parent_inode() | |
806 | { | |
807 | if (parent) | |
808 | return parent->dir->inode; | |
809 | return NULL; | |
810 | } | |
811 | ||
812 | bool CInode::is_projected_ancestor_of(CInode *other) | |
813 | { | |
814 | while (other) { | |
815 | if (other == this) | |
816 | return true; | |
817 | if (!other->get_projected_parent_dn()) | |
818 | break; | |
819 | other = other->get_projected_parent_dn()->get_dir()->get_inode(); | |
820 | } | |
821 | return false; | |
822 | } | |
823 | ||
824 | /* | |
825 | * Because a non-directory inode may have multiple links, the use_parent | |
826 | * argument allows selecting which parent to use for path construction. This | |
827 | * argument is only meaningful for the final component (i.e. the first of the | |
828 | * nested calls) because directories cannot have multiple hard links. If | |
829 | * use_parent is NULL and projected is true, the primary parent's projected | |
830 | * inode is used all the way up the path chain. Otherwise the primary parent | |
831 | * stable inode is used. | |
832 | */ | |
// Build a human-readable path for this inode (see comment above for the
// use_parent/projected semantics).  Unparented inodes render as "" (root),
// "~mdsN" (mdsdir), or "#<hex ino>" — note the last case appends to s
// rather than assigning, preserving any prefix the caller built up.
void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const
{
  if (!use_parent) {
    use_parent = projected ? get_projected_parent_dn() : parent;
  }

  if (use_parent) {
    // recurse up through the chosen parent dentry
    use_parent->make_path_string(s, projected);
  } else if (is_root()) {
    s = "";
  } else if (is_mdsdir()) {
    char t[40];
    uint64_t eino(ino());
    eino -= MDS_INO_MDSDIR_OFFSET;
    snprintf(t, sizeof(t), "~mds%" PRId64, eino);
    s = t;
  } else {
    // detached inode: identify it by raw ino
    char n[40];
    uint64_t eino(ino());
    snprintf(n, sizeof(n), "#%" PRIx64, eino);
    s += n;
  }
}
856 | ||
857 | void CInode::make_path(filepath& fp, bool projected) const | |
858 | { | |
859 | const CDentry *use_parent = projected ? get_projected_parent_dn() : parent; | |
860 | if (use_parent) { | |
861 | assert(!is_base()); | |
862 | use_parent->make_path(fp, projected); | |
863 | } else { | |
864 | fp = filepath(ino()); | |
865 | } | |
866 | } | |
867 | ||
868 | void CInode::name_stray_dentry(string& dname) | |
869 | { | |
870 | char s[20]; | |
871 | snprintf(s, sizeof(s), "%llx", (unsigned long long)inode.ino.val); | |
872 | dname = s; | |
873 | } | |
874 | ||
// Reserve the next version number for a pending change.  With a parent
// dentry, the version flows through CDentry::pre_dirty() so the dentry
// and dir are versioned along with us; a base inode (root/mdsdir) just
// increments its projected version.
version_t CInode::pre_dirty()
{
  version_t pv;
  CDentry* _cdentry = get_projected_parent_dn();
  if (_cdentry) {
    pv = _cdentry->pre_dirty(get_projected_version());
    dout(10) << "pre_dirty " << pv << " (current v " << inode.version << ")" << dendl;
  } else {
    assert(is_base());
    pv = get_projected_version() + 1;
  }
  // force update backtrace for old format inode (see inode_t::decode)
  if (inode.backtrace_version == 0 && !projected_nodes.empty()) {
    inode_t *pi = projected_nodes.back()->inode;
    if (pi->backtrace_version == 0)
      pi->update_backtrace(pv);
  }
  return pv;
}
894 | ||
895 | void CInode::_mark_dirty(LogSegment *ls) | |
896 | { | |
897 | if (!state_test(STATE_DIRTY)) { | |
898 | state_set(STATE_DIRTY); | |
899 | get(PIN_DIRTY); | |
900 | assert(ls); | |
901 | } | |
902 | ||
903 | // move myself to this segment's dirty list | |
904 | if (ls) | |
905 | ls->dirty_inodes.push_back(&item_dirty); | |
906 | } | |
907 | ||
// Mark this inode dirty at (pre-reserved) version pv in log segment ls,
// and propagate dirtiness to the primary dentry.
void CInode::mark_dirty(version_t pv, LogSegment *ls) {
  
  dout(10) << "mark_dirty " << *this << dendl;

  /*
    NOTE: I may already be dirty, but this fn _still_ needs to be called so that
    the directory is (perhaps newly) dirtied, and so that parent_dir_version is
    updated below.
  */
  
  // only auth can get dirty.  "dirty" async data in replicas is relative to
  // filelock state, not the dirty flag.
  assert(is_auth());
  
  // touch my private version
  assert(inode.version < pv);   // pv must come from a prior pre_dirty()
  inode.version = pv;
  _mark_dirty(ls);

  // mark dentry too
  if (parent)
    parent->mark_dirty(pv, ls);
}
931 | ||
932 | ||
933 | void CInode::mark_clean() | |
934 | { | |
935 | dout(10) << " mark_clean " << *this << dendl; | |
936 | if (state_test(STATE_DIRTY)) { | |
937 | state_clear(STATE_DIRTY); | |
938 | put(PIN_DIRTY); | |
939 | ||
940 | // remove myself from ls dirty list | |
941 | item_dirty.remove_myself(); | |
942 | } | |
943 | } | |
944 | ||
945 | ||
946 | // -------------- | |
947 | // per-inode storage | |
948 | // (currently for root inode only) | |
949 | ||
/**
 * I/O completion for CInode::store(): forwards the objecter result,
 * the version that was written, and the caller's completion into
 * CInode::_stored() (run on the MDS finisher via C_OnFinisher).
 */
struct C_IO_Inode_Stored : public CInodeIOContext {
  version_t version;  // projected version captured when the write was issued
  Context *fin;       // caller's completion, handed through to _stored()
  C_IO_Inode_Stored(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
  void finish(int r) override {
    in->_stored(r, version, fin);
  }
};
958 | ||
959 | object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, const char *suffix) | |
960 | { | |
961 | char n[60]; | |
962 | snprintf(n, sizeof(n), "%llx.%08llx%s", (long long unsigned)ino, (long long unsigned)fg, suffix ? suffix : ""); | |
963 | return object_t(n); | |
964 | } | |
965 | ||
/**
 * Persist this base inode into its own "<ino>.inode" object in the
 * metadata pool.  Only base inodes (see the assert) are stored this
 * way; regular inodes live embedded in their parent dirfrag.
 *
 * @param fin completion delivered (via _stored) when the write finishes
 */
void CInode::store(MDSInternalContextBase *fin)
{
  dout(10) << "store " << get_version() << dendl;
  assert(is_base());

  if (snaprealm)
    purge_stale_snap_data(snaprealm->get_snaps());

  // encode: on-disk magic first, then the full inode store payload
  bufferlist bl;
  string magic = CEPH_FS_ONDISK_MAGIC;
  ::encode(magic, bl);
  encode_store(bl, mdcache->mds->mdsmap->get_up_features());

  // write it.
  SnapContext snapc;
  ObjectOperation m;
  m.write_full(bl);

  object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode");
  object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool());

  // bounce completion through the MDS finisher so _stored() runs in
  // MDS context, capturing the version being written for the
  // v == get_projected_version() check there
  Context *newfin =
    new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin),
		     mdcache->mds->finisher);
  mdcache->mds->objecter->mutate(oid, oloc, m, snapc,
				 ceph::real_clock::now(), 0,
				 newfin);
}
995 | ||
/**
 * Completion for store().  On error, log to the cluster log and let
 * the MDS error handler decide whether to abort/respawn; on success,
 * mark the inode clean only if no newer version was projected while
 * the write was in flight.
 */
void CInode::_stored(int r, version_t v, Context *fin)
{
  if (r < 0) {
    dout(1) << "store error " << r << " v " << v << " on " << *this << dendl;
    mdcache->mds->clog->error() << "failed to store inode " << ino()
                                << " object: " << cpp_strerror(r);
    mdcache->mds->handle_write_error(r);
    fin->complete(r);
    return;
  }

  dout(10) << "_stored " << v << " on " << *this << dendl;
  // stale writes (v older than projected) must not clear the dirty flag
  if (v == get_projected_version())
    mark_clean();

  fin->complete(0);
}
1013 | ||
/**
 * Flush this inode's dirty state to stable storage: the backtrace if
 * the parent linkage is dirty, plus either the inode object itself
 * (base inodes) or the parent dirfrag commit (everything else).
 * Completes @p fin once all issued writes finish, or immediately if
 * nothing is dirty.
 */
void CInode::flush(MDSInternalContextBase *fin)
{
  dout(10) << "flush " << *this << dendl;
  assert(is_auth() && can_auth_pin());

  MDSGatherBuilder gather(g_ceph_context);

  if (is_dirty_parent()) {
    store_backtrace(gather.new_sub());
  }
  if (is_dirty()) {
    if (is_base()) {
      store(gather.new_sub());
    } else {
      // non-base inodes are persisted as part of their parent dirfrag
      parent->dir->commit(0, gather.new_sub());
    }
  }

  if (gather.has_subs()) {
    gather.set_finisher(fin);
    gather.activate();
  } else {
    // nothing to do; complete synchronously
    fin->complete(0);
  }
}
1039 | ||
/**
 * I/O completion for CInode::fetch(): collects the results of the two
 * parallel reads (legacy xattr-based location in bl, current .inode
 * object in bl2) and hands both to CInode::_fetched() to decode.
 */
struct C_IO_Inode_Fetched : public CInodeIOContext {
  bufferlist bl, bl2;  // bl: legacy "inode" xattr; bl2: .inode object data
  Context *fin;        // caller's completion, passed through to _fetched()
  C_IO_Inode_Fetched(CInode *i, Context *f) : CInodeIOContext(i), fin(f) {}
  void finish(int r) override {
    // Ignore 'r', because we fetch from two places, so r is usually ENOENT
    in->_fetched(bl, bl2, fin);
  }
};
1049 | ||
/**
 * Load this inode from the metadata pool.  Issues two reads in
 * parallel — the legacy location (an "inode" xattr on the dirfrag
 * object) and the current location (a dedicated ".inode" object) —
 * and lets _fetched() pick whichever returned data.
 *
 * @param fin completion invoked by _fetched() with the decode result
 */
void CInode::fetch(MDSInternalContextBase *fin)
{
  dout(10) << "fetch" << dendl;

  C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin);
  C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c, mdcache->mds->finisher));

  object_t oid = CInode::get_object_name(ino(), frag_t(), "");
  object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool());

  // Old on-disk format: inode stored in xattr of a dirfrag
  ObjectOperation rd;
  rd.getxattr("inode", &c->bl, NULL);
  mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, (bufferlist*)NULL, 0, gather.new_sub());

  // Current on-disk format: inode stored in a .inode object
  object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode");
  mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP, &c->bl2, 0, gather.new_sub());

  gather.activate();
}
1071 | ||
/**
 * Completion for fetch(): decode whichever of the two reads returned
 * data, preferring the current-format ".inode" object (bl2) over the
 * legacy xattr payload (bl).
 *
 * Completes @p fin with 0 on success, -ENOENT if neither read
 * returned data, or -EINVAL on bad magic / decode failure.
 */
void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin)
{
  dout(10) << "_fetched got " << bl.length() << " and " << bl2.length() << dendl;
  bufferlist::iterator p;
  if (bl2.length()) {
    p = bl2.begin();
  } else if (bl.length()) {
    p = bl.begin();
  } else {
    derr << "No data while reading inode " << ino() << dendl;
    fin->complete(-ENOENT);
    return;
  }

  // Attempt decode
  try {
    string magic;
    ::decode(magic, p);
    dout(10) << " magic is '" << magic << "' (expecting '"
	     << CEPH_FS_ONDISK_MAGIC << "')" << dendl;
    if (magic != CEPH_FS_ONDISK_MAGIC) {
      dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC
	      << "'" << dendl;
      fin->complete(-EINVAL);
    } else {
      decode_store(p);
      dout(10) << "_fetched " << *this << dendl;
      fin->complete(0);
    }
  } catch (buffer::error &err) {
    // truncated/garbled payload: surface as corruption, not a crash
    derr << "Corrupt inode " << ino() << ": " << err << dendl;
    fin->complete(-EINVAL);
    return;
  }
}
1107 | ||
1108 | void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt) | |
1109 | { | |
1110 | bt.ino = inode.ino; | |
1111 | bt.ancestors.clear(); | |
1112 | bt.pool = pool; | |
1113 | ||
1114 | CInode *in = this; | |
1115 | CDentry *pdn = get_parent_dn(); | |
1116 | while (pdn) { | |
1117 | CInode *diri = pdn->get_dir()->get_inode(); | |
1118 | bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, in->inode.version)); | |
1119 | in = diri; | |
1120 | pdn = in->get_parent_dn(); | |
1121 | } | |
1122 | for (compact_set<int64_t>::iterator i = inode.old_pools.begin(); | |
1123 | i != inode.old_pools.end(); | |
1124 | ++i) { | |
1125 | // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0) | |
1126 | if (*i != pool) | |
1127 | bt.old_pools.insert(*i); | |
1128 | } | |
1129 | } | |
1130 | ||
/**
 * I/O completion for CInode::store_backtrace(): forwards the result
 * and the backtrace version that was written into
 * CInode::_stored_backtrace().
 */
struct C_IO_Inode_StoredBacktrace : public CInodeIOContext {
  version_t version;  // backtrace_version captured when the write was issued
  Context *fin;       // caller's completion (may be NULL), passed through
  C_IO_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
  void finish(int r) override {
    in->_stored_backtrace(r, version, fin);
  }
};
1139 | ||
/**
 * Write this inode's backtrace (and layout) as xattrs on the object
 * in its backtrace pool.  If the inode recently changed pools
 * (STATE_DIRTYPOOL with non-empty old_pools), the backtrace is also
 * rewritten into every old pool so readers there are redirected to
 * the new pool.
 *
 * Auth-pins the inode until _stored_backtrace() runs.
 *
 * @param fin     completion delivered via _stored_backtrace()
 * @param op_prio objecter op priority; <0 selects CEPH_MSG_PRIO_DEFAULT
 */
void CInode::store_backtrace(MDSInternalContextBase *fin, int op_prio)
{
  dout(10) << "store_backtrace on " << *this << dendl;
  assert(is_dirty_parent());

  if (op_prio < 0)
    op_prio = CEPH_MSG_PRIO_DEFAULT;

  // pinned until _stored_backtrace() auth_unpins
  auth_pin(this);

  const int64_t pool = get_backtrace_pool();
  inode_backtrace_t bt;
  build_backtrace(pool, bt);
  bufferlist parent_bl;
  ::encode(bt, parent_bl);

  ObjectOperation op;
  op.priority = op_prio;
  op.create(false);  // create the object if absent, don't fail if it exists
  op.setxattr("parent", parent_bl);

  bufferlist layout_bl;
  ::encode(inode.layout, layout_bl, mdcache->mds->mdsmap->get_up_features());
  op.setxattr("layout", layout_bl);

  SnapContext snapc;
  object_t oid = get_object_name(ino(), frag_t(), "");
  object_locator_t oloc(pool);
  // capture backtrace_version so _stored_backtrace can tell whether a
  // newer backtrace was dirtied while this write was in flight
  Context *fin2 = new C_OnFinisher(
    new C_IO_Inode_StoredBacktrace(this, inode.backtrace_version, fin),
    mdcache->mds->finisher);

  if (!state_test(STATE_DIRTYPOOL) || inode.old_pools.empty()) {
    // common case: only the current pool needs the backtrace
    dout(20) << __func__ << ": no dirtypool or no old pools" << dendl;
    mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
				   ceph::real_clock::now(),
				   0, fin2);
    return;
  }

  C_GatherBuilder gather(g_ceph_context, fin2);
  mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
				 ceph::real_clock::now(),
				 0, gather.new_sub());

  // In the case where DIRTYPOOL is set, we update all old pools backtraces
  // such that anyone reading them will see the new pool ID in
  // inode_backtrace_t::pool and go read everything else from there.
  for (compact_set<int64_t>::iterator p = inode.old_pools.begin();
       p != inode.old_pools.end();
       ++p) {
    if (*p == pool)
      continue;

    dout(20) << __func__ << ": updating old pool " << *p << dendl;

    ObjectOperation op;
    op.priority = op_prio;
    op.create(false);
    op.setxattr("parent", parent_bl);

    object_locator_t oloc(*p);
    mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
				   ceph::real_clock::now(),
				   0, gather.new_sub());
  }
  gather.activate();
}
1208 | ||
/**
 * Completion for store_backtrace().  Tolerates ENOENT caused by the
 * target pool having been deleted; reports any other error to the
 * cluster log and MDS write-error handling.  Clears the dirty-parent
 * state only if no newer backtrace version was dirtied meanwhile.
 *
 * @param r   objecter result
 * @param v   backtrace_version that was written
 * @param fin caller's completion; may be NULL
 */
void CInode::_stored_backtrace(int r, version_t v, Context *fin)
{
  if (r == -ENOENT) {
    const int64_t pool = get_backtrace_pool();
    bool exists = mdcache->mds->objecter->with_osdmap(
      [pool](const OSDMap &osd_map) {
	return osd_map.have_pg_pool(pool);
      });

    // This ENOENT is because the pool doesn't exist (the user deleted it
    // out from under us), so the backtrace can never be written, so pretend
    // to succeed so that the user can proceed to e.g. delete the file.
    if (!exists) {
      dout(4) << "store_backtrace got ENOENT: a data pool was deleted "
	         "beneath us!" << dendl;
      r = 0;
    }
  }

  if (r < 0) {
    dout(1) << "store backtrace error " << r << " v " << v << dendl;
    mdcache->mds->clog->error() << "failed to store backtrace on ino "
				<< ino() << " object"
				<< ", pool " << get_backtrace_pool()
				<< ", errno " << r;
    mdcache->mds->handle_write_error(r);
    if (fin)
      fin->complete(r);
    return;
  }

  dout(10) << "_stored_backtrace v " << v << dendl;

  // balances the auth_pin taken in store_backtrace()
  auth_unpin(this);
  // a newer backtrace may have been dirtied while we were writing
  if (v == inode.backtrace_version)
    clear_dirty_parent();
  if (fin)
    fin->complete(0);
}
1248 | ||
/**
 * Asynchronously read this inode's backtrace ("parent" xattr) from
 * its backtrace pool into @p backtrace; @p fin fires when the read
 * completes.
 */
void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace)
{
  mdcache->fetch_backtrace(inode.ino, get_backtrace_pool(), *backtrace, fin);
}
1253 | ||
1254 | void CInode::_mark_dirty_parent(LogSegment *ls, bool dirty_pool) | |
1255 | { | |
1256 | if (!state_test(STATE_DIRTYPARENT)) { | |
1257 | dout(10) << "mark_dirty_parent" << dendl; | |
1258 | state_set(STATE_DIRTYPARENT); | |
1259 | get(PIN_DIRTYPARENT); | |
1260 | assert(ls); | |
1261 | } | |
1262 | if (dirty_pool) | |
1263 | state_set(STATE_DIRTYPOOL); | |
1264 | if (ls) | |
1265 | ls->dirty_parent_inodes.push_back(&item_dirty_parent); | |
1266 | } | |
1267 | ||
1268 | void CInode::clear_dirty_parent() | |
1269 | { | |
1270 | if (state_test(STATE_DIRTYPARENT)) { | |
1271 | dout(10) << "clear_dirty_parent" << dendl; | |
1272 | state_clear(STATE_DIRTYPARENT); | |
1273 | state_clear(STATE_DIRTYPOOL); | |
1274 | put(PIN_DIRTYPARENT); | |
1275 | item_dirty_parent.remove_myself(); | |
1276 | } | |
1277 | } | |
1278 | ||
/**
 * Cross-check a directory inode's on-disk backtrace against its
 * in-cache parent dentry.  On mismatch (or a read error in @p err),
 * log to the cluster log, optionally assert out (when
 * mds_verify_backtrace > 1), and re-dirty the parent so a corrected
 * backtrace gets written.
 *
 * @param bl  encoded inode_backtrace_t as read from disk (valid if err == 0)
 * @param err read result; non-zero means the backtrace could not be read
 */
void CInode::verify_diri_backtrace(bufferlist &bl, int err)
{
  // base inodes have no backtrace, a dirty parent will be rewritten
  // anyway, and replicas don't own the backtrace
  if (is_base() || is_dirty_parent() || !is_auth())
    return;

  dout(10) << "verify_diri_backtrace" << dendl;

  if (err == 0) {
    inode_backtrace_t backtrace;
    ::decode(backtrace, bl);
    CDentry *pdn = get_parent_dn();
    // the first ancestor must match our current parent dentry
    if (backtrace.ancestors.empty() ||
	backtrace.ancestors[0].dname != pdn->name ||
	backtrace.ancestors[0].dirino != pdn->get_dir()->ino())
      err = -EINVAL;
  }

  if (err) {
    MDSRank *mds = mdcache->mds;
    mds->clog->error() << "bad backtrace on directory inode " << ino();
    // !"bad backtrace" is 0, so this asserts (and aborts with the string
    // visible in the message) exactly when mds_verify_backtrace > 1
    assert(!"bad backtrace" == (g_conf->mds_verify_backtrace > 1));

    // schedule a corrective backtrace write
    _mark_dirty_parent(mds->mdlog->get_current_segment(), false);
    mds->mdlog->flush();
  }
}
1305 | ||
1306 | // ------------------ | |
1307 | // parent dir | |
1308 | ||
1309 | ||
/**
 * Encode the inode store payload without a version envelope.
 * NOTE: field order here is the on-disk/wire format and must stay in
 * sync with decode_bare().
 *
 * @param snap_blob pre-encoded snaprealm data; an empty bufferlist is
 *                  encoded in its place when NULL
 */
void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features,
				 const bufferlist *snap_blob) const
{
  ::encode(inode, bl, features);
  // symlink target is only present for symlinks
  if (is_symlink())
    ::encode(symlink, bl);
  ::encode(dirfragtree, bl);
  ::encode(xattrs, bl);
  if (snap_blob)
    ::encode(*snap_blob, bl);
  else
    ::encode(bufferlist(), bl);
  ::encode(old_inodes, bl, features);
  ::encode(oldest_snap, bl);
  ::encode(damage_flags, bl);
}
1326 | ||
/**
 * Encode the inode store wrapped in a versioned envelope
 * (current struct_v 6, compat 4); see decode() for the reverse.
 */
void InodeStoreBase::encode(bufferlist &bl, uint64_t features,
			    const bufferlist *snap_blob) const
{
  ENCODE_START(6, 4, bl);
  encode_bare(bl, features, snap_blob);
  ENCODE_FINISH(bl);
}
1334 | ||
1335 | void CInode::encode_store(bufferlist& bl, uint64_t features) | |
1336 | { | |
1337 | bufferlist snap_blob; | |
1338 | encode_snap_blob(snap_blob); | |
1339 | InodeStoreBase::encode(bl, mdcache->mds->mdsmap->get_up_features(), | |
1340 | &snap_blob); | |
1341 | } | |
1342 | ||
/**
 * Decode the inode store payload (reverse of encode_bare), handling
 * legacy versions:
 *  - struct_v 2 dirs carried an optional default_file_layout, of
 *    which only the layout portion is retained;
 *  - oldest_snap/damage_flags (struct_v >= 5) are decoded only if
 *    bytes remain, because dentry-embedded InodeStores were written
 *    without proper length framing.
 *
 * @param snap_blob out: still-encoded snaprealm data for the caller
 */
void InodeStoreBase::decode_bare(bufferlist::iterator &bl,
			      bufferlist& snap_blob, __u8 struct_v)
{
  ::decode(inode, bl);
  if (is_symlink())
    ::decode(symlink, bl);
  ::decode(dirfragtree, bl);
  ::decode(xattrs, bl);
  ::decode(snap_blob, bl);

  ::decode(old_inodes, bl);
  if (struct_v == 2 && inode.is_dir()) {
    bool default_layout_exists;
    ::decode(default_layout_exists, bl);
    if (default_layout_exists) {
      ::decode(struct_v, bl); // this was a default_file_layout
      ::decode(inode.layout, bl); // but we only care about the layout portion
    }
  }

  if (struct_v >= 5) {
    // InodeStore is embedded in dentries without proper versioning, so
    // we consume up to the end of the buffer
    if (!bl.end()) {
      ::decode(oldest_snap, bl);
    }

    if (!bl.end()) {
      ::decode(damage_flags, bl);
    }
  }
}
1375 | ||
1376 | ||
/**
 * Decode a versioned inode store envelope (accepts legacy
 * pre-envelope encodings back to v4); reverse of encode().
 */
void InodeStoreBase::decode(bufferlist::iterator &bl, bufferlist& snap_blob)
{
  DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
  decode_bare(bl, snap_blob, struct_v);
  DECODE_FINISH(bl);
}
1383 | ||
/**
 * Decode this inode from storage (reverse of encode_store),
 * rehydrating the snaprealm from its embedded blob.
 */
void CInode::decode_store(bufferlist::iterator& bl)
{
  bufferlist snap_blob;
  InodeStoreBase::decode(bl, snap_blob);
  decode_snap_blob(snap_blob);
}
1390 | ||
1391 | // ------------------ | |
1392 | // locking | |
1393 | ||
/**
 * Fill in the cache-object identity (ino + snapid) used to address
 * this inode in lock/cache messages.
 */
void CInode::set_object_info(MDSCacheObjectInfo &info)
{
  info.ino = ino();
  info.snapid = last;
}
1399 | ||
1400 | void CInode::encode_lock_state(int type, bufferlist& bl) | |
1401 | { | |
1402 | ::encode(first, bl); | |
1403 | ||
1404 | switch (type) { | |
1405 | case CEPH_LOCK_IAUTH: | |
1406 | ::encode(inode.version, bl); | |
1407 | ::encode(inode.ctime, bl); | |
1408 | ::encode(inode.mode, bl); | |
1409 | ::encode(inode.uid, bl); | |
1410 | ::encode(inode.gid, bl); | |
1411 | break; | |
1412 | ||
1413 | case CEPH_LOCK_ILINK: | |
1414 | ::encode(inode.version, bl); | |
1415 | ::encode(inode.ctime, bl); | |
1416 | ::encode(inode.nlink, bl); | |
1417 | break; | |
1418 | ||
1419 | case CEPH_LOCK_IDFT: | |
1420 | if (is_auth()) { | |
1421 | ::encode(inode.version, bl); | |
1422 | } else { | |
1423 | // treat flushing as dirty when rejoining cache | |
1424 | bool dirty = dirfragtreelock.is_dirty_or_flushing(); | |
1425 | ::encode(dirty, bl); | |
1426 | } | |
1427 | { | |
1428 | // encode the raw tree | |
1429 | ::encode(dirfragtree, bl); | |
1430 | ||
1431 | // also specify which frags are mine | |
1432 | set<frag_t> myfrags; | |
1433 | list<CDir*> dfls; | |
1434 | get_dirfrags(dfls); | |
1435 | for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) | |
1436 | if ((*p)->is_auth()) { | |
1437 | frag_t fg = (*p)->get_frag(); | |
1438 | myfrags.insert(fg); | |
1439 | } | |
1440 | ::encode(myfrags, bl); | |
1441 | } | |
1442 | break; | |
1443 | ||
1444 | case CEPH_LOCK_IFILE: | |
1445 | if (is_auth()) { | |
1446 | ::encode(inode.version, bl); | |
1447 | ::encode(inode.ctime, bl); | |
1448 | ::encode(inode.mtime, bl); | |
1449 | ::encode(inode.atime, bl); | |
1450 | ::encode(inode.time_warp_seq, bl); | |
1451 | if (!is_dir()) { | |
1452 | ::encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features()); | |
1453 | ::encode(inode.size, bl); | |
1454 | ::encode(inode.truncate_seq, bl); | |
1455 | ::encode(inode.truncate_size, bl); | |
1456 | ::encode(inode.client_ranges, bl); | |
1457 | ::encode(inode.inline_data, bl); | |
1458 | } | |
1459 | } else { | |
1460 | // treat flushing as dirty when rejoining cache | |
1461 | bool dirty = filelock.is_dirty_or_flushing(); | |
1462 | ::encode(dirty, bl); | |
1463 | } | |
1464 | ||
1465 | { | |
1466 | dout(15) << "encode_lock_state inode.dirstat is " << inode.dirstat << dendl; | |
1467 | ::encode(inode.dirstat, bl); // only meaningful if i am auth. | |
1468 | bufferlist tmp; | |
1469 | __u32 n = 0; | |
1470 | for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); | |
1471 | p != dirfrags.end(); | |
1472 | ++p) { | |
1473 | frag_t fg = p->first; | |
1474 | CDir *dir = p->second; | |
1475 | if (is_auth() || dir->is_auth()) { | |
1476 | fnode_t *pf = dir->get_projected_fnode(); | |
1477 | dout(15) << fg << " " << *dir << dendl; | |
1478 | dout(20) << fg << " fragstat " << pf->fragstat << dendl; | |
1479 | dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl; | |
1480 | ::encode(fg, tmp); | |
1481 | ::encode(dir->first, tmp); | |
1482 | ::encode(pf->fragstat, tmp); | |
1483 | ::encode(pf->accounted_fragstat, tmp); | |
1484 | n++; | |
1485 | } | |
1486 | } | |
1487 | ::encode(n, bl); | |
1488 | bl.claim_append(tmp); | |
1489 | } | |
1490 | break; | |
1491 | ||
1492 | case CEPH_LOCK_INEST: | |
1493 | if (is_auth()) { | |
1494 | ::encode(inode.version, bl); | |
1495 | } else { | |
1496 | // treat flushing as dirty when rejoining cache | |
1497 | bool dirty = nestlock.is_dirty_or_flushing(); | |
1498 | ::encode(dirty, bl); | |
1499 | } | |
1500 | { | |
1501 | dout(15) << "encode_lock_state inode.rstat is " << inode.rstat << dendl; | |
1502 | ::encode(inode.rstat, bl); // only meaningful if i am auth. | |
1503 | bufferlist tmp; | |
1504 | __u32 n = 0; | |
1505 | for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin(); | |
1506 | p != dirfrags.end(); | |
1507 | ++p) { | |
1508 | frag_t fg = p->first; | |
1509 | CDir *dir = p->second; | |
1510 | if (is_auth() || dir->is_auth()) { | |
1511 | fnode_t *pf = dir->get_projected_fnode(); | |
1512 | dout(10) << fg << " " << *dir << dendl; | |
1513 | dout(10) << fg << " " << pf->rstat << dendl; | |
1514 | dout(10) << fg << " " << pf->rstat << dendl; | |
1515 | dout(10) << fg << " " << dir->dirty_old_rstat << dendl; | |
1516 | ::encode(fg, tmp); | |
1517 | ::encode(dir->first, tmp); | |
1518 | ::encode(pf->rstat, tmp); | |
1519 | ::encode(pf->accounted_rstat, tmp); | |
1520 | ::encode(dir->dirty_old_rstat, tmp); | |
1521 | n++; | |
1522 | } | |
1523 | } | |
1524 | ::encode(n, bl); | |
1525 | bl.claim_append(tmp); | |
1526 | } | |
1527 | break; | |
1528 | ||
1529 | case CEPH_LOCK_IXATTR: | |
1530 | ::encode(inode.version, bl); | |
1531 | ::encode(inode.ctime, bl); | |
1532 | ::encode(xattrs, bl); | |
1533 | break; | |
1534 | ||
1535 | case CEPH_LOCK_ISNAP: | |
1536 | ::encode(inode.version, bl); | |
1537 | ::encode(inode.ctime, bl); | |
1538 | encode_snap(bl); | |
1539 | break; | |
1540 | ||
1541 | case CEPH_LOCK_IFLOCK: | |
1542 | ::encode(inode.version, bl); | |
1543 | _encode_file_locks(bl); | |
1544 | break; | |
1545 | ||
1546 | case CEPH_LOCK_IPOLICY: | |
1547 | if (inode.is_dir()) { | |
1548 | ::encode(inode.version, bl); | |
1549 | ::encode(inode.ctime, bl); | |
1550 | ::encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features()); | |
1551 | ::encode(inode.quota, bl); | |
1552 | ::encode(inode.export_pin, bl); | |
1553 | } | |
1554 | break; | |
1555 | ||
1556 | default: | |
1557 | ceph_abort(); | |
1558 | } | |
1559 | } | |
1560 | ||
1561 | ||
1562 | /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ | |
1563 | ||
/**
 * Decode lock state produced by encode_lock_state() on a peer MDS and
 * apply it.  The payload order per lock type must match the encoder
 * exactly.  On the auth side, a replica's dirty flag marks our
 * scatterlock dirty; on the replica side we adopt the auth's
 * authoritative values.  ctime is merged by taking the max, since it
 * can be advanced under several different locks.
 */
void CInode::decode_lock_state(int type, bufferlist& bl)
{
  bufferlist::iterator p = bl.begin();
  utime_t tm;

  snapid_t newfirst;
  ::decode(newfirst, p);

  // replicas follow the auth's 'first'; it only ever moves forward
  if (!is_auth() && newfirst != first) {
    dout(10) << "decode_lock_state first " << first << " -> " << newfirst << dendl;
    assert(newfirst > first);
    if (!is_multiversion() && parent) {
      assert(parent->first == first);
      parent->first = newfirst;
    }
    first = newfirst;
  }

  switch (type) {
  case CEPH_LOCK_IAUTH:
    ::decode(inode.version, p);
    ::decode(tm, p);
    if (inode.ctime < tm) inode.ctime = tm;
    ::decode(inode.mode, p);
    ::decode(inode.uid, p);
    ::decode(inode.gid, p);
    break;

  case CEPH_LOCK_ILINK:
    ::decode(inode.version, p);
    ::decode(tm, p);
    if (inode.ctime < tm) inode.ctime = tm;
    ::decode(inode.nlink, p);
    break;

  case CEPH_LOCK_IDFT:
    if (is_auth()) {
      bool replica_dirty;
      ::decode(replica_dirty, p);
      if (replica_dirty) {
	dout(10) << "decode_lock_state setting dftlock dirty flag" << dendl;
	dirfragtreelock.mark_dirty();  // ok bc we're auth and caller will handle
      }
    } else {
      ::decode(inode.version, p);
    }
    {
      fragtree_t temp;
      ::decode(temp, p);
      set<frag_t> authfrags;
      ::decode(authfrags, p);
      if (is_auth()) {
	// auth.  believe replica's auth frags only.
	for (set<frag_t>::iterator p = authfrags.begin(); p != authfrags.end(); ++p)
	  if (!dirfragtree.is_leaf(*p)) {
	    dout(10) << " forcing frag " << *p << " to leaf (split|merge)" << dendl;
	    dirfragtree.force_to_leaf(g_ceph_context, *p);
	    dirfragtreelock.mark_dirty();  // ok bc we're auth and caller will handle
	  }
      } else {
	// replica.  take the new tree, BUT make sure any open
	// dirfrags remain leaves (they may have split _after_ this
	// dft was scattered, or we may still be waiting on the
	// notify from the auth)
	dirfragtree.swap(temp);
	for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
	     p != dirfrags.end();
	     ++p) {
	  if (!dirfragtree.is_leaf(p->first)) {
	    dout(10) << " forcing open dirfrag " << p->first << " to leaf (racing with split|merge)" << dendl;
	    dirfragtree.force_to_leaf(g_ceph_context, p->first);
	  }
	  if (p->second->is_auth())
	    p->second->state_clear(CDir::STATE_DIRTYDFT);
	}
      }
      if (g_conf->mds_debug_frag)
	verify_dirfrags();
    }
    break;

  case CEPH_LOCK_IFILE:
    if (!is_auth()) {
      // replica: adopt the auth's file metadata
      ::decode(inode.version, p);
      ::decode(tm, p);
      if (inode.ctime < tm) inode.ctime = tm;
      ::decode(inode.mtime, p);
      ::decode(inode.atime, p);
      ::decode(inode.time_warp_seq, p);
      if (!is_dir()) {
	::decode(inode.layout, p);
	::decode(inode.size, p);
	::decode(inode.truncate_seq, p);
	::decode(inode.truncate_size, p);
	::decode(inode.client_ranges, p);
	::decode(inode.inline_data, p);
      }
    } else {
      bool replica_dirty;
      ::decode(replica_dirty, p);
      if (replica_dirty) {
	dout(10) << "decode_lock_state setting filelock dirty flag" << dendl;
	filelock.mark_dirty();  // ok bc we're auth and caller will handle
      }
    }
    {
      frag_info_t dirstat;
      ::decode(dirstat, p);
      if (!is_auth()) {
	dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl;
	inode.dirstat = dirstat;  // take inode summation if replica
      }
      __u32 n;
      ::decode(n, p);
      dout(10) << " ...got " << n << " fragstats on " << *this << dendl;
      while (n--) {
	frag_t fg;
	snapid_t fgfirst;
	frag_info_t fragstat;
	frag_info_t accounted_fragstat;
	::decode(fg, p);
	::decode(fgfirst, p);
	::decode(fragstat, p);
	::decode(accounted_fragstat, p);
	dout(10) << fg << " [" << fgfirst << ",head] " << dendl;
	dout(10) << fg << " fragstat " << fragstat << dendl;
	dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl;

	CDir *dir = get_dirfrag(fg);
	if (is_auth()) {
	  assert(dir);                // i am auth; i had better have this dir open
	  dout(10) << fg << " first " << dir->first << " -> " << fgfirst
		   << " on " << *dir << dendl;
	  dir->first = fgfirst;
	  dir->fnode.fragstat = fragstat;
	  dir->fnode.accounted_fragstat = accounted_fragstat;
	  dir->first = fgfirst;
	  // mismatched accounting means the scatterlock must be flushed
	  if (!(fragstat == accounted_fragstat)) {
	    dout(10) << fg << " setting filelock updated flag" << dendl;
	    filelock.mark_dirty();  // ok bc we're auth and caller will handle
	  }
	} else {
	  if (dir && dir->is_auth()) {
	    dout(10) << fg << " first " << dir->first << " -> " << fgfirst
		     << " on " << *dir << dendl;
	    dir->first = fgfirst;
	    fnode_t *pf = dir->get_projected_fnode();
	    finish_scatter_update(&filelock, dir,
				  inode.dirstat.version, pf->accounted_fragstat.version);
	  }
	}
      }
    }
    break;

  case CEPH_LOCK_INEST:
    if (is_auth()) {
      bool replica_dirty;
      ::decode(replica_dirty, p);
      if (replica_dirty) {
	dout(10) << "decode_lock_state setting nestlock dirty flag" << dendl;
	nestlock.mark_dirty();  // ok bc we're auth and caller will handle
      }
    } else {
      ::decode(inode.version, p);
    }
    {
      nest_info_t rstat;
      ::decode(rstat, p);
      if (!is_auth()) {
	dout(10) << " taking inode rstat " << rstat << " for " << *this << dendl;
	inode.rstat = rstat;  // take inode summation if replica
      }
      __u32 n;
      ::decode(n, p);
      while (n--) {
	frag_t fg;
	snapid_t fgfirst;
	nest_info_t rstat;
	nest_info_t accounted_rstat;
	compact_map<snapid_t,old_rstat_t> dirty_old_rstat;
	::decode(fg, p);
	::decode(fgfirst, p);
	::decode(rstat, p);
	::decode(accounted_rstat, p);
	::decode(dirty_old_rstat, p);
	dout(10) << fg << " [" << fgfirst << ",head]" << dendl;
	dout(10) << fg << " rstat " << rstat << dendl;
	dout(10) << fg << " accounted_rstat " << accounted_rstat << dendl;
	dout(10) << fg << " dirty_old_rstat " << dirty_old_rstat << dendl;

	CDir *dir = get_dirfrag(fg);
	if (is_auth()) {
	  assert(dir);                // i am auth; i had better have this dir open
	  dout(10) << fg << " first " << dir->first << " -> " << fgfirst
		   << " on " << *dir << dendl;
	  dir->first = fgfirst;
	  dir->fnode.rstat = rstat;
	  dir->fnode.accounted_rstat = accounted_rstat;
	  dir->dirty_old_rstat.swap(dirty_old_rstat);
	  if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) {
	    dout(10) << fg << " setting nestlock updated flag" << dendl;
	    nestlock.mark_dirty();  // ok bc we're auth and caller will handle
	  }
	} else {
	  if (dir && dir->is_auth()) {
	    dout(10) << fg << " first " << dir->first << " -> " << fgfirst
		     << " on " << *dir << dendl;
	    dir->first = fgfirst;
	    fnode_t *pf = dir->get_projected_fnode();
	    finish_scatter_update(&nestlock, dir,
				  inode.rstat.version, pf->accounted_rstat.version);
	  }
	}
      }
    }
    break;

  case CEPH_LOCK_IXATTR:
    ::decode(inode.version, p);
    ::decode(tm, p);
    if (inode.ctime < tm) inode.ctime = tm;
    ::decode(xattrs, p);
    break;

  case CEPH_LOCK_ISNAP:
    {
      ::decode(inode.version, p);
      ::decode(tm, p);
      if (inode.ctime < tm) inode.ctime = tm;
      snapid_t seq = 0;
      if (snaprealm)
	seq = snaprealm->srnode.seq;
      decode_snap(p);
      // notify clients if the realm's snapshot sequence changed
      if (snaprealm && snaprealm->srnode.seq != seq)
	mdcache->do_realm_invalidate_and_update_notify(this, seq ? CEPH_SNAP_OP_UPDATE:CEPH_SNAP_OP_SPLIT);
    }
    break;

  case CEPH_LOCK_IFLOCK:
    ::decode(inode.version, p);
    _decode_file_locks(p);
    break;

  case CEPH_LOCK_IPOLICY:
    if (inode.is_dir()) {
      ::decode(inode.version, p);
      ::decode(tm, p);
      if (inode.ctime < tm) inode.ctime = tm;
      ::decode(inode.layout, p);
      ::decode(inode.quota, p);
      // react if the export pin changed
      mds_rank_t old_pin = inode.export_pin;
      ::decode(inode.export_pin, p);
      maybe_export_pin(old_pin != inode.export_pin);
    }
    break;

  default:
    ceph_abort();
  }
}
1825 | ||
1826 | ||
1827 | bool CInode::is_dirty_scattered() | |
1828 | { | |
1829 | return | |
1830 | filelock.is_dirty_or_flushing() || | |
1831 | nestlock.is_dirty_or_flushing() || | |
1832 | dirfragtreelock.is_dirty_or_flushing(); | |
1833 | } | |
1834 | ||
1835 | void CInode::clear_scatter_dirty() | |
1836 | { | |
1837 | filelock.remove_dirty(); | |
1838 | nestlock.remove_dirty(); | |
1839 | dirfragtreelock.remove_dirty(); | |
1840 | } | |
1841 | ||
1842 | void CInode::clear_dirty_scattered(int type) | |
1843 | { | |
1844 | dout(10) << "clear_dirty_scattered " << type << " on " << *this << dendl; | |
b32b8144 | 1845 | assert(is_dir()); |
7c673cae FG |
1846 | switch (type) { |
1847 | case CEPH_LOCK_IFILE: | |
1848 | item_dirty_dirfrag_dir.remove_myself(); | |
1849 | break; | |
1850 | ||
1851 | case CEPH_LOCK_INEST: | |
1852 | item_dirty_dirfrag_nest.remove_myself(); | |
1853 | break; | |
1854 | ||
1855 | case CEPH_LOCK_IDFT: | |
1856 | item_dirty_dirfrag_dirfragtree.remove_myself(); | |
1857 | break; | |
1858 | ||
1859 | default: | |
1860 | ceph_abort(); | |
1861 | } | |
1862 | } | |
1863 | ||
1864 | ||
1865 | /* | |
1866 | * when we initially scatter a lock, we need to check if any of the dirfrags | |
1867 | * have out of date accounted_rstat/fragstat. if so, mark the lock stale. | |
1868 | */ | |
1869 | /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ | |
// Scatter a lock out to the dirfrags.  For each auth dirfrag, check whether
// its accounted stat version matches the inode's; if not, journal an update
// (or mark the lock stale) via finish_scatter_update().
void CInode::start_scatter(ScatterLock *lock)
{
  dout(10) << "start_scatter " << *lock << " on " << *this << dendl;
  assert(is_auth());
  inode_t *pi = get_projected_inode();

  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
       p != dirfrags.end();
       ++p) {
    frag_t fg = p->first;
    CDir *dir = p->second;
    fnode_t *pf = dir->get_projected_fnode();
    dout(20) << fg << " " << *dir << dendl;

    // replicas of a dirfrag carry no stats we are responsible for
    if (!dir->is_auth())
      continue;

    switch (lock->get_type()) {
    case CEPH_LOCK_IFILE:
      // dirstat accounting: compare against accounted_fragstat
      finish_scatter_update(lock, dir, pi->dirstat.version, pf->accounted_fragstat.version);
      break;

    case CEPH_LOCK_INEST:
      // recursive stat accounting: compare against accounted_rstat
      finish_scatter_update(lock, dir, pi->rstat.version, pf->accounted_rstat.version);
      break;

    case CEPH_LOCK_IDFT:
      // fragtree changes are fully journaled elsewhere; just clear the flag
      dir->state_clear(CDir::STATE_DIRTYDFT);
      break;
    }
  }
}
1902 | ||
1903 | ||
// Completion for the EUpdate journaled by CInode::finish_scatter_update();
// fires once the log entry is safe and applies the projected fnode.
class C_Inode_FragUpdate : public MDSLogContextBase {
protected:
  CInode *in;        // inode whose scatter stat was updated
  CDir *dir;         // the dirfrag that was journaled
  MutationRef mut;   // mutation holding projections, pins and locks
  MDSRank *get_mds() override {return in->mdcache->mds;}
  void finish(int r) override {
    in->_finish_frag_update(dir, mut);
  }    

public:
  C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m) : in(i), dir(d), mut(m) {}
};
1917 | ||
// Bring one dirfrag's accounted scatter stat up to the inode's version.
// If the frag is frozen or not loaded we cannot touch it, so the lock is
// left "stale" for this frag; otherwise, when the accounted version lags,
// project the fnode, stamp the new version, and journal an EUpdate whose
// completion (C_Inode_FragUpdate) applies the change.
/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
				   version_t inode_version, version_t dir_accounted_version)
{
  frag_t fg = dir->get_frag();
  assert(dir->is_auth());

  if (dir->is_frozen()) {
    dout(10) << "finish_scatter_update " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl;
  } else if (dir->get_version() == 0) {
    dout(10) << "finish_scatter_update " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl;
  } else {
    if (dir_accounted_version != inode_version) {
      dout(10) << "finish_scatter_update " << fg << " journaling accounted scatterstat update v" << inode_version << dendl;

      MDLog *mdlog = mdcache->mds->mdlog;
      MutationRef mut(new MutationImpl());
      mut->ls = mdlog->get_current_segment();

      inode_t *pi = get_projected_inode();
      fnode_t *pf = dir->project_fnode();

      const char *ename = 0;
      switch (lock->get_type()) {
      case CEPH_LOCK_IFILE:
	// stamp fragstat to the inode's dirstat version and account it
	pf->fragstat.version = pi->dirstat.version;
	pf->accounted_fragstat = pf->fragstat;
	ename = "lock ifile accounted scatter stat update";
	break;
      case CEPH_LOCK_INEST:
	pf->rstat.version = pi->rstat.version;
	pf->accounted_rstat = pf->rstat;
	ename = "lock inest accounted scatter stat update";

	// NOTE(review): this branch runs when the *inode* is non-auth but the
	// dirfrag is auth and the nestlock is in MIX — pull dirty child
	// rstats into the frag before journaling.
	if (!is_auth() && lock->get_state() == LOCK_MIX) {
	  dout(10) << "finish_scatter_update try to assimilate dirty rstat on "
	    << *dir << dendl;
	  dir->assimilate_dirty_rstat_inodes();
       }

	break;
      default:
	ceph_abort();
      }
	
      pf->version = dir->pre_dirty();
      mut->add_projected_fnode(dir);

      EUpdate *le = new EUpdate(mdlog, ename);
      mdlog->start_entry(le);
      le->metablob.add_dir_context(dir);
      le->metablob.add_dir(dir, true);
      
      // the frozen check above guarantees the auth_pin is legal
      assert(!dir->is_frozen());
      mut->auth_pin(dir);

      if (lock->get_type() == CEPH_LOCK_INEST &&
	  !is_auth() && lock->get_state() == LOCK_MIX) {
        dout(10) << "finish_scatter_update finish assimilating dirty rstat on "
          << *dir << dendl;
        dir->assimilate_dirty_rstat_inodes_finish(mut, &le->metablob);

        // if assimilation left rstat != accounted_rstat, make sure the
        // nestlock is wrlocked and flagged as updated so it gets flushed
        if (!(pf->rstat == pf->accounted_rstat)) {
          if (mut->wrlocks.count(&nestlock) == 0) {
            mdcache->mds->locker->wrlock_force(&nestlock, mut);
          }

          mdcache->mds->locker->mark_updated_scatterlock(&nestlock);
          mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest);
        }
      }

      mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut));
    } else {
      dout(10) << "finish_scatter_update " << fg << " accounted " << *lock
	       << " scatter stat unchanged at v" << dir_accounted_version << dendl;
    }
  }
}
1996 | ||
1997 | void CInode::_finish_frag_update(CDir *dir, MutationRef& mut) | |
1998 | { | |
1999 | dout(10) << "_finish_frag_update on " << *dir << dendl; | |
2000 | mut->apply(); | |
c07f9fc5 | 2001 | mdcache->mds->locker->drop_locks(mut.get()); |
7c673cae FG |
2002 | mut->cleanup(); |
2003 | } | |
2004 | ||
2005 | ||
2006 | /* | |
2007 | * when we gather a lock, we need to assimilate dirfrag changes into the inode | |
2008 | * state. it's possible we can't update the dirfrag accounted_rstat/fragstat | |
2009 | * because the frag is auth and frozen, or that the replica couldn't for the same | |
2010 | * reason. hopefully it will get updated the next time the lock cycles. | |
2011 | * | |
2012 | * we have two dimensions of behavior: | |
2013 | * - we may be (auth and !frozen), and able to update, or not. | |
2014 | * - the frag may be stale, or not. | |
2015 | * | |
2016 | * if the frag is non-stale, we want to assimilate the diff into the | |
2017 | * inode, regardless of whether it's auth or updateable. | |
2018 | * | |
2019 | * if we update the frag, we want to set accounted_fragstat = frag, | |
2020 | * both if we took the diff or it was stale and we are making it | |
2021 | * un-stale. | |
2022 | */ | |
2023 | /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ | |
// Gather the scatterlock: fold every dirfrag's (fragstat or rstat) delta
// back into the inode, bumping the inode's stat version.  Frags that are
// non-auth/frozen/unloaded cannot be updated in place, and frags whose
// accounted version is not exactly one behind are stale and skipped.
// Negative or mismatched totals are logged (and assert when
// mds_verify_scatter is set) and then clamped/repaired best-effort.
/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
void CInode::finish_scatter_gather_update(int type)
{
  LogChannelRef clog = mdcache->mds->clog;

  dout(10) << "finish_scatter_gather_update " << type << " on " << *this << dendl;
  assert(is_auth());

  switch (type) {
  case CEPH_LOCK_IFILE:
    {
      fragtree_t tmpdft = dirfragtree;
      struct frag_info_t dirstat;
      bool dirstat_valid = true;

      // adjust summation
      assert(is_auth());
      inode_t *pi = get_projected_inode();

      bool touched_mtime = false, touched_chattr = false;
      dout(20) << "  orig dirstat " << pi->dirstat << dendl;
      pi->dirstat.version++;
      for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
	   p != dirfrags.end();
	   ++p) {
	frag_t fg = p->first;
	CDir *dir = p->second;
	dout(20) << fg << " " << *dir << dendl;

	// can we write the new accounted stat into this frag?
	bool update;
	if (dir->get_version() != 0) {
	  update = dir->is_auth() && !dir->is_frozen();
	} else {
	  update = false;
	  dirstat_valid = false;  // unloaded frag -> total is unknowable
	}

	fnode_t *pf = dir->get_projected_fnode();
	if (update)
	  pf = dir->project_fnode();

	// only a frag exactly one version behind contributes a valid delta
	if (pf->accounted_fragstat.version == pi->dirstat.version - 1) {
	  dout(20) << fg << "           fragstat " << pf->fragstat << dendl;
	  dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
	  pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
	} else {
	  dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl;
	}

	if (pf->fragstat.nfiles < 0 ||
	    pf->fragstat.nsubdirs < 0) {
	  clog->error() << "bad/negative dir size on "
	      << dir->dirfrag() << " " << pf->fragstat;
	  assert(!"bad/negative fragstat" == g_conf->mds_verify_scatter);
	  
	  // clamp so we don't propagate garbage
	  if (pf->fragstat.nfiles < 0)
	    pf->fragstat.nfiles = 0;
	  if (pf->fragstat.nsubdirs < 0)
	    pf->fragstat.nsubdirs = 0;
	}

	if (update) {
	  pf->accounted_fragstat = pf->fragstat;
	  pf->fragstat.version = pf->accounted_fragstat.version = pi->dirstat.version;
	  dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl;
	}

	tmpdft.force_to_leaf(g_ceph_context, fg);
	dirstat.add(pf->fragstat);
      }
      if (touched_mtime)
	pi->mtime = pi->ctime = pi->dirstat.mtime;
      if (touched_chattr)
	pi->change_attr = pi->dirstat.change_attr;
      dout(20) << " final dirstat " << pi->dirstat << dendl;

      if (dirstat_valid && !dirstat.same_sums(pi->dirstat)) {
	// a mismatch only counts if we saw every leaf of the fragtree
	list<frag_t> ls;
	tmpdft.get_leaves_under(frag_t(), ls);
	for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p)
	  if (!dirfrags.count(*p)) {
	    dirstat_valid = false;
	    break;
	  }
	if (dirstat_valid) {
	  if (state_test(CInode::STATE_REPAIRSTATS)) {
	    dout(20) << " dirstat mismatch, fixing" << dendl;
	  } else {
	    clog->error() << "unmatched fragstat on " << ino() << ", inode has "
			  << pi->dirstat << ", dirfrags have " << dirstat;
	    assert(!"unmatched fragstat" == g_conf->mds_verify_scatter);
	  }
	  // trust the dirfrags for now
	  version_t v = pi->dirstat.version;
	  if (pi->dirstat.mtime > dirstat.mtime)
	    dirstat.mtime = pi->dirstat.mtime;
	  if (pi->dirstat.change_attr > dirstat.change_attr)
	    dirstat.change_attr = pi->dirstat.change_attr;
	  pi->dirstat = dirstat;
	  pi->dirstat.version = v;
	}
      }

      if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0)
      {
        std::string path;
        make_path_string(path);
	clog->error() << "Inconsistent statistics detected: fragstat on inode "
                      << ino() << " (" << path << "), inode has " << pi->dirstat;
	assert(!"bad/negative fragstat" == g_conf->mds_verify_scatter);

	// clamp the inode totals as well
	if (pi->dirstat.nfiles < 0)
	  pi->dirstat.nfiles = 0;
	if (pi->dirstat.nsubdirs < 0)
	  pi->dirstat.nsubdirs = 0;
      }
    }
    break;

  case CEPH_LOCK_INEST:
    {
      fragtree_t tmpdft = dirfragtree;
      nest_info_t rstat;
      rstat.rsubdirs = 1;  // the directory itself counts as one subdir
      bool rstat_valid = true;

      // adjust summation
      assert(is_auth());
      inode_t *pi = get_projected_inode();
      dout(20) << "  orig rstat " << pi->rstat << dendl;
      pi->rstat.version++;
      for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
	   p != dirfrags.end();
	   ++p) {
	frag_t fg = p->first;
	CDir *dir = p->second;
	dout(20) << fg << " " << *dir << dendl;

	bool update;
	if (dir->get_version() != 0) {
	  update = dir->is_auth() && !dir->is_frozen();
	} else {
	  update = false;
	  rstat_valid = false;
	}

	fnode_t *pf = dir->get_projected_fnode();
	if (update)
	  pf = dir->project_fnode();

	if (pf->accounted_rstat.version == pi->rstat.version-1) {
	  // only pull this frag's dirty rstat inodes into the frag if
	  // the frag is non-stale and updateable.  if it's stale,
	  // that info will just get thrown out!
	  if (update)
	    dir->assimilate_dirty_rstat_inodes();

	  dout(20) << fg << "           rstat " << pf->rstat << dendl;
	  dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
	  dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
	  mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat,
					       dir->first, CEPH_NOSNAP, this, true);
	  // fold in per-snapshot deltas too
	  for (compact_map<snapid_t,old_rstat_t>::iterator q = dir->dirty_old_rstat.begin();
	       q != dir->dirty_old_rstat.end();
	       ++q)
	    mdcache->project_rstat_frag_to_inode(q->second.rstat, q->second.accounted_rstat,
						 q->second.first, q->first, this, true);
	  if (update)  // dir contents not valid if frozen or non-auth
	    dir->check_rstats();
	} else {
	  dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl;
	}
	if (update) {
	  pf->accounted_rstat = pf->rstat;
	  dir->dirty_old_rstat.clear();
	  pf->rstat.version = pf->accounted_rstat.version = pi->rstat.version;
	  dir->check_rstats();
	  dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl;
	}

	tmpdft.force_to_leaf(g_ceph_context, fg);
	rstat.add(pf->rstat);
      }
      dout(20) << " final rstat " << pi->rstat << dendl;

      if (rstat_valid && !rstat.same_sums(pi->rstat)) {
	// only report a mismatch if every fragtree leaf was seen
	list<frag_t> ls;
	tmpdft.get_leaves_under(frag_t(), ls);
	for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p)
	  if (!dirfrags.count(*p)) {
	    rstat_valid = false;
	    break;
	  }
	if (rstat_valid) {
	  if (state_test(CInode::STATE_REPAIRSTATS)) {
	    dout(20) << " rstat mismatch, fixing" << dendl;
	  } else {
	    clog->error() << "inconsistent rstat on inode " << ino()
                          << ", inode has " << pi->rstat
                          << ", directory fragments have " << rstat;
	    assert(!"unmatched rstat" == g_conf->mds_verify_scatter);
	  }
	  // trust the dirfrag for now
	  version_t v = pi->rstat.version;
	  if (pi->rstat.rctime > rstat.rctime)
	    rstat.rctime = pi->rstat.rctime;
	  pi->rstat = rstat;
	  pi->rstat.version = v;
	}
      }

      mdcache->broadcast_quota_to_client(this);
    }
    break;

  case CEPH_LOCK_IDFT:
    break;  // fragtree has no accounted stats to gather

  default:
    ceph_abort();
  }
}
2245 | ||
// Journal the accounted-stat updates produced by finish_scatter_gather_update():
// for every updateable (auth, loaded, unfrozen) dirfrag, pre-dirty the
// projected fnode, attach it to the mutation, and add the frag to the metablob.
void CInode::finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob)
{
  dout(10) << "finish_scatter_gather_update_accounted " << type << " on " << *this << dendl;
  assert(is_auth());

  for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
       p != dirfrags.end();
       ++p) {
    CDir *dir = p->second;
    // skip frags the gather pass could not update
    if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen())
      continue;
    
    if (type == CEPH_LOCK_IDFT)
      continue;  // nothing to do.

    dout(10) << " journaling updated frag accounted_ on " << *dir << dendl;
    assert(dir->is_projected());
    fnode_t *pf = dir->get_projected_fnode();
    pf->version = dir->pre_dirty();
    mut->add_projected_fnode(dir);
    metablob->add_dir(dir, true);
    mut->auth_pin(dir);

    // rstat gather may have assimilated dirty child inodes; finish that now
    if (type == CEPH_LOCK_INEST)
      dir->assimilate_dirty_rstat_inodes_finish(mut, metablob);
  }
}
2273 | ||
2274 | // waiting | |
2275 | ||
2276 | bool CInode::is_frozen() const | |
2277 | { | |
2278 | if (is_frozen_inode()) return true; | |
2279 | if (parent && parent->dir->is_frozen()) return true; | |
2280 | return false; | |
2281 | } | |
2282 | ||
2283 | bool CInode::is_frozen_dir() const | |
2284 | { | |
2285 | if (parent && parent->dir->is_frozen_dir()) return true; | |
2286 | return false; | |
2287 | } | |
2288 | ||
2289 | bool CInode::is_freezing() const | |
2290 | { | |
2291 | if (is_freezing_inode()) return true; | |
2292 | if (parent && parent->dir->is_freezing()) return true; | |
2293 | return false; | |
2294 | } | |
2295 | ||
2296 | void CInode::add_dir_waiter(frag_t fg, MDSInternalContextBase *c) | |
2297 | { | |
2298 | if (waiting_on_dir.empty()) | |
2299 | get(PIN_DIRWAITER); | |
2300 | waiting_on_dir[fg].push_back(c); | |
2301 | dout(10) << "add_dir_waiter frag " << fg << " " << c << " on " << *this << dendl; | |
2302 | } | |
2303 | ||
2304 | void CInode::take_dir_waiting(frag_t fg, list<MDSInternalContextBase*>& ls) | |
2305 | { | |
2306 | if (waiting_on_dir.empty()) | |
2307 | return; | |
2308 | ||
2309 | compact_map<frag_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dir.find(fg); | |
2310 | if (p != waiting_on_dir.end()) { | |
2311 | dout(10) << "take_dir_waiting frag " << fg << " on " << *this << dendl; | |
2312 | ls.splice(ls.end(), p->second); | |
2313 | waiting_on_dir.erase(p); | |
2314 | ||
2315 | if (waiting_on_dir.empty()) | |
2316 | put(PIN_DIRWAITER); | |
2317 | } | |
2318 | } | |
2319 | ||
// Register a generic waiter.  SINGLEAUTH/UNFREEZE waits are routed up to the
// parent dirfrag unless it is this inode itself that is ambiguous, frozen,
// or freezing — in that case the waiter stays here.
void CInode::add_waiter(uint64_t tag, MDSInternalContextBase *c)
{
  dout(10) << "add_waiter tag " << std::hex << tag << std::dec << " " << c
	   << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH)
	   << " !frozen " << !is_frozen_inode()
	   << " !freezing " << !is_freezing_inode()
	   << dendl;
  // wait on the directory?
  //  make sure its not the inode that is explicitly ambiguous|freezing|frozen
  if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) ||
      ((tag & WAIT_UNFREEZE) &&
       !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) {
    dout(15) << "passing waiter up tree" << dendl;
    parent->dir->add_waiter(tag, c);
    return;
  }
  dout(15) << "taking waiter here" << dendl;
  MDSCacheObject::add_waiter(tag, c);
}
2339 | ||
// Collect waiters matching the mask.  WAIT_DIR drains every per-frag waiter
// list (and releases the matching PIN_DIRWAITER); everything else is handled
// by the generic MDSCacheObject waiter machinery.
void CInode::take_waiting(uint64_t mask, list<MDSInternalContextBase*>& ls)
{
  if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) {
    // take all dentry waiters
    while (!waiting_on_dir.empty()) {
      compact_map<frag_t, list<MDSInternalContextBase*> >::iterator p = waiting_on_dir.begin();
      dout(10) << "take_waiting dirfrag " << p->first << " on " << *this << dendl;
      ls.splice(ls.end(), p->second);  // move contexts, no copies
      waiting_on_dir.erase(p);
    }
    put(PIN_DIRWAITER);  // pin was taken by the first add_dir_waiter()
  }
  
  // waiting
  MDSCacheObject::take_waiting(mask, ls);
}
2356 | ||
// Try to freeze this inode.  auth_pin_allowance is the number of auth pins
// the caller itself holds and tolerates; if more pins exist we enter the
// FREEZING state and return false — auth_unpin() completes the freeze when
// the count drops to the allowance.  Returns true once frozen.
bool CInode::freeze_inode(int auth_pin_allowance)
{
  assert(auth_pin_allowance > 0);  // otherwise we need to adjust parent's nested_auth_pins
  assert(auth_pins >= auth_pin_allowance);
  if (auth_pins > auth_pin_allowance) {
    dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl;
    auth_pin_freeze_allowance = auth_pin_allowance;
    get(PIN_FREEZING);
    state_set(STATE_FREEZING);
    return false;
  }

  dout(10) << "freeze_inode - frozen" << dendl;
  assert(auth_pins == auth_pin_allowance);
  if (!state_test(STATE_FROZEN)) {
    get(PIN_FROZEN);
    state_set(STATE_FROZEN);
  }
  return true;
}
2377 | ||
2378 | void CInode::unfreeze_inode(list<MDSInternalContextBase*>& finished) | |
2379 | { | |
2380 | dout(10) << "unfreeze_inode" << dendl; | |
2381 | if (state_test(STATE_FREEZING)) { | |
2382 | state_clear(STATE_FREEZING); | |
2383 | put(PIN_FREEZING); | |
2384 | } else if (state_test(STATE_FROZEN)) { | |
2385 | state_clear(STATE_FROZEN); | |
2386 | put(PIN_FROZEN); | |
2387 | } else | |
2388 | ceph_abort(); | |
2389 | take_waiting(WAIT_UNFREEZE, finished); | |
2390 | } | |
2391 | ||
2392 | void CInode::unfreeze_inode() | |
2393 | { | |
2394 | list<MDSInternalContextBase*> finished; | |
2395 | unfreeze_inode(finished); | |
2396 | mdcache->mds->queue_waiters(finished); | |
2397 | } | |
2398 | ||
2399 | void CInode::freeze_auth_pin() | |
2400 | { | |
2401 | assert(state_test(CInode::STATE_FROZEN)); | |
2402 | state_set(CInode::STATE_FROZENAUTHPIN); | |
2403 | } | |
2404 | ||
2405 | void CInode::unfreeze_auth_pin() | |
2406 | { | |
2407 | assert(state_test(CInode::STATE_FROZENAUTHPIN)); | |
2408 | state_clear(CInode::STATE_FROZENAUTHPIN); | |
2409 | if (!state_test(STATE_FREEZING|STATE_FROZEN)) { | |
2410 | list<MDSInternalContextBase*> finished; | |
2411 | take_waiting(WAIT_UNFREEZE, finished); | |
2412 | mdcache->mds->queue_waiters(finished); | |
2413 | } | |
2414 | } | |
2415 | ||
2416 | void CInode::clear_ambiguous_auth(list<MDSInternalContextBase*>& finished) | |
2417 | { | |
2418 | assert(state_test(CInode::STATE_AMBIGUOUSAUTH)); | |
2419 | state_clear(CInode::STATE_AMBIGUOUSAUTH); | |
2420 | take_waiting(CInode::WAIT_SINGLEAUTH, finished); | |
2421 | } | |
2422 | ||
2423 | void CInode::clear_ambiguous_auth() | |
2424 | { | |
2425 | list<MDSInternalContextBase*> finished; | |
2426 | clear_ambiguous_auth(finished); | |
2427 | mdcache->mds->queue_waiters(finished); | |
2428 | } | |
2429 | ||
2430 | // auth_pins | |
2431 | bool CInode::can_auth_pin() const { | |
2432 | if (!is_auth() || is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) | |
2433 | return false; | |
2434 | if (parent) | |
2435 | return parent->can_auth_pin(); | |
2436 | return true; | |
2437 | } | |
2438 | ||
2439 | void CInode::auth_pin(void *by) | |
2440 | { | |
2441 | if (auth_pins == 0) | |
2442 | get(PIN_AUTHPIN); | |
2443 | auth_pins++; | |
2444 | ||
2445 | #ifdef MDS_AUTHPIN_SET | |
2446 | auth_pin_set.insert(by); | |
2447 | #endif | |
2448 | ||
2449 | dout(10) << "auth_pin by " << by << " on " << *this | |
2450 | << " now " << auth_pins << "+" << nested_auth_pins | |
2451 | << dendl; | |
2452 | ||
2453 | if (parent) | |
2454 | parent->adjust_nested_auth_pins(1, 1, this); | |
2455 | } | |
2456 | ||
// Release one auth pin.  If the inode is FREEZING and the count has dropped
// to the allowance recorded by freeze_inode(), complete the transition to
// FROZEN and wake WAIT_FROZEN waiters.
void CInode::auth_unpin(void *by) 
{
  auth_pins--;

#ifdef MDS_AUTHPIN_SET
  assert(auth_pin_set.count(by));
  auth_pin_set.erase(auth_pin_set.find(by));
#endif

  // last pin drops the cache reference taken by auth_pin()
  if (auth_pins == 0)
    put(PIN_AUTHPIN);
  
  dout(10) << "auth_unpin by " << by << " on " << *this
	   << " now " << auth_pins << "+" << nested_auth_pins
	   << dendl;
  
  assert(auth_pins >= 0);

  // propagate the decrement up through the parent dentry
  if (parent)
    parent->adjust_nested_auth_pins(-1, -1, by);

  if (is_freezing_inode() &&
      auth_pins == auth_pin_freeze_allowance) {
    dout(10) << "auth_unpin freezing!" << dendl;
    get(PIN_FROZEN);
    put(PIN_FREEZING);
    state_clear(STATE_FREEZING);
    state_set(STATE_FROZEN);
    finish_waiting(WAIT_FROZEN);
  }
}
2488 | ||
// Adjust the count of auth pins held by this inode's dirfrags, and
// propagate the change up through the parent dentry.  With
// mds_debug_auth_pins set, audit the count against the dirfrags.
void CInode::adjust_nested_auth_pins(int a, void *by)
{
  assert(a);
  nested_auth_pins += a;
  dout(35) << "adjust_nested_auth_pins by " << by
	   << " change " << a << " yields "
	   << auth_pins << "+" << nested_auth_pins << dendl;
  assert(nested_auth_pins >= 0);

  if (g_conf->mds_debug_auth_pins) {
    // audit: each non-subtree-root dirfrag with any pins contributes one
    int s = 0;
    for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
	 p != dirfrags.end();
	 ++p) {
      CDir *dir = p->second;
      if (!dir->is_subtree_root() && dir->get_cum_auth_pins())
	s++;
    } 
    assert(s == nested_auth_pins);
  } 

  if (parent)
    parent->adjust_nested_auth_pins(a, 0, by);
}
2514 | ||
2515 | ||
2516 | // authority | |
2517 | ||
2518 | mds_authority_t CInode::authority() const | |
2519 | { | |
2520 | if (inode_auth.first >= 0) | |
2521 | return inode_auth; | |
2522 | ||
2523 | if (parent) | |
2524 | return parent->dir->authority(); | |
2525 | ||
2526 | // new items that are not yet linked in (in the committed plane) belong | |
2527 | // to their first parent. | |
2528 | if (!projected_parent.empty()) | |
2529 | return projected_parent.front()->dir->authority(); | |
2530 | ||
2531 | return CDIR_AUTH_UNDEF; | |
2532 | } | |
2533 | ||
2534 | ||
2535 | // SNAP | |
2536 | ||
2537 | snapid_t CInode::get_oldest_snap() | |
2538 | { | |
2539 | snapid_t t = first; | |
2540 | if (!old_inodes.empty()) | |
2541 | t = old_inodes.begin()->second.first; | |
2542 | return MIN(t, oldest_snap); | |
2543 | } | |
2544 | ||
// Copy-on-write the (projected) head inode into old_inodes, covering the
// snapid interval [first, follows], then advance first past follows.
// cow_head selects the newest projection vs. the previous one.
old_inode_t& CInode::cow_old_inode(snapid_t follows, bool cow_head)
{
  assert(follows >= first);

  inode_t *pi = cow_head ? get_projected_inode() : get_previous_projected_inode();
  map<string,bufferptr> *px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs();

  // snapshot inode + xattrs for the interval starting at our current first
  old_inode_t &old = old_inodes[follows];
  old.first = first;
  old.inode = *pi;
  old.xattrs = *px;

  if (first < oldest_snap)
    oldest_snap = first;

  dout(10) << " " << px->size() << " xattrs cowed, " << *px << dendl;

  // client writeback ranges past 'follows' belong to the head, not the cow
  old.inode.trim_client_ranges(follows);

  // remember that this old inode still has unaccounted rstat deltas
  if (g_conf->mds_snap_rstat &&
      !(old.inode.rstat == old.inode.accounted_rstat))
    dirty_old_rstats.insert(follows);
  
  first = follows+1;

  dout(10) << "cow_old_inode " << (cow_head ? "head" : "previous_head" )
	   << " to [" << old.first << "," << follows << "] on "
	   << *this << dendl;

  return old;
}
2576 | ||
// Split the old_inode interval containing 'snap' into [old.first, snap-1]
// and [snap, p->first]; the existing entry keeps the later half.
void CInode::split_old_inode(snapid_t snap)
{
  compact_map<snapid_t, old_inode_t>::iterator p = old_inodes.lower_bound(snap);
  assert(p != old_inodes.end() && p->second.first < snap);

  // new entry keyed at snap-1 takes over the earlier half of the interval
  old_inode_t &old = old_inodes[snap - 1];
  old = p->second;

  p->second.first = snap;
  dout(10) << "split_old_inode " << "[" << old.first << "," << p->first
	   << "] to [" << snap << "," << p->first << "] on " << *this << dendl;
}
2589 | ||
2590 | void CInode::pre_cow_old_inode() | |
2591 | { | |
2592 | snapid_t follows = find_snaprealm()->get_newest_seq(); | |
2593 | if (first <= follows) | |
2594 | cow_old_inode(follows, true); | |
2595 | } | |
2596 | ||
// Drop old_inode entries whose interval [second.first, first] contains none
// of the still-live snapids in 'snaps'.
void CInode::purge_stale_snap_data(const set<snapid_t>& snaps)
{
  dout(10) << "purge_stale_snap_data " << snaps << dendl;

  if (old_inodes.empty())
    return;

  compact_map<snapid_t,old_inode_t>::iterator p = old_inodes.begin();
  while (p != old_inodes.end()) {
    // first live snap at or after the interval start; if it lies past the
    // interval end (p->first), no snap references this old inode
    set<snapid_t>::const_iterator q = snaps.lower_bound(p->second.first);
    if (q == snaps.end() || *q > p->first) {
      dout(10) << " purging old_inode [" << p->second.first << "," << p->first << "]" << dendl;
      old_inodes.erase(p++);  // post-increment keeps the iterator valid
    } else
      ++p;
  }
}
2614 | ||
2615 | /* | |
2616 | * pick/create an old_inode | |
2617 | */ | |
2618 | old_inode_t * CInode::pick_old_inode(snapid_t snap) | |
2619 | { | |
2620 | compact_map<snapid_t, old_inode_t>::iterator p = old_inodes.lower_bound(snap); // p is first key >= to snap | |
2621 | if (p != old_inodes.end() && p->second.first <= snap) { | |
2622 | dout(10) << "pick_old_inode snap " << snap << " -> [" << p->second.first << "," << p->first << "]" << dendl; | |
2623 | return &p->second; | |
2624 | } | |
2625 | dout(10) << "pick_old_inode snap " << snap << " -> nothing" << dendl; | |
2626 | return NULL; | |
2627 | } | |
2628 | ||
// Create this inode's SnapRealm (if it does not already have one) and
// attach it under the nearest ancestor realm; unless nosplit, move the
// ancestor's caps/inodes that now belong here via split_at().
void CInode::open_snaprealm(bool nosplit)
{
  if (!snaprealm) {
    SnapRealm *parent = find_snaprealm();
    snaprealm = new SnapRealm(mdcache, this);
    if (parent) {
      dout(10) << "open_snaprealm " << snaprealm
	       << " parent is " << parent
	       << dendl;
      dout(30) << " siblings are " << parent->open_children << dendl;
      snaprealm->parent = parent;
      if (!nosplit)
	parent->split_at(snaprealm);
      parent->open_children.insert(snaprealm);
    }
  }
}
// Detach and destroy this inode's SnapRealm.  NOTE(review): 'nojoin' is
// currently unused — the join-back into the parent realm is commented out.
void CInode::close_snaprealm(bool nojoin)
{
  if (snaprealm) {
    dout(15) << "close_snaprealm " << *snaprealm << dendl;
    snaprealm->close_parents();
    if (snaprealm->parent) {
      snaprealm->parent->open_children.erase(snaprealm);
      //if (!nojoin)
      //snaprealm->parent->join(snaprealm);
    }
    delete snaprealm;
    snaprealm = 0;
  }
}
2660 | ||
2661 | SnapRealm *CInode::find_snaprealm() const | |
2662 | { | |
2663 | const CInode *cur = this; | |
2664 | while (!cur->snaprealm) { | |
2665 | if (cur->get_parent_dn()) | |
2666 | cur = cur->get_parent_dn()->get_dir()->get_inode(); | |
2667 | else if (get_projected_parent_dn()) | |
2668 | cur = cur->get_projected_parent_dn()->get_dir()->get_inode(); | |
2669 | else | |
2670 | break; | |
2671 | } | |
2672 | return cur->snaprealm; | |
2673 | } | |
2674 | ||
2675 | void CInode::encode_snap_blob(bufferlist &snapbl) | |
2676 | { | |
2677 | if (snaprealm) { | |
2678 | ::encode(snaprealm->srnode, snapbl); | |
2679 | dout(20) << "encode_snap_blob " << *snaprealm << dendl; | |
2680 | } | |
2681 | } | |
// Inverse of encode_snap_blob(): a non-empty blob (re)opens the snaprealm
// and decodes its srnode; base inodes must be able to open parent realms.
void CInode::decode_snap_blob(bufferlist& snapbl)
{
  if (snapbl.length()) {
    open_snaprealm();  // no-op if the realm already exists
    bufferlist::iterator p = snapbl.begin();
    ::decode(snaprealm->srnode, p);
    if (is_base()) {
      bool ok = snaprealm->_open_parents(NULL);
      assert(ok);
    }
    dout(20) << "decode_snap_blob " << *snaprealm << dendl;
  }
}
2695 | ||
2696 | void CInode::encode_snap(bufferlist& bl) | |
2697 | { | |
2698 | bufferlist snapbl; | |
2699 | encode_snap_blob(snapbl); | |
2700 | ::encode(snapbl, bl); | |
2701 | ::encode(oldest_snap, bl); | |
2702 | } | |
2703 | ||
2704 | void CInode::decode_snap(bufferlist::iterator& p) | |
2705 | { | |
2706 | bufferlist snapbl; | |
2707 | ::decode(snapbl, p); | |
2708 | ::decode(oldest_snap, p); | |
2709 | decode_snap_blob(snapbl); | |
2710 | } | |
2711 | ||
2712 | // ============================================= | |
2713 | ||
2714 | client_t CInode::calc_ideal_loner() | |
2715 | { | |
2716 | if (mdcache->is_readonly()) | |
2717 | return -1; | |
2718 | if (!mds_caps_wanted.empty()) | |
2719 | return -1; | |
2720 | ||
2721 | int n = 0; | |
2722 | client_t loner = -1; | |
2723 | for (map<client_t,Capability*>::iterator it = client_caps.begin(); | |
2724 | it != client_caps.end(); | |
2725 | ++it) | |
2726 | if (!it->second->is_stale() && | |
2727 | ((it->second->wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_WR|CEPH_CAP_FILE_RD)) || | |
2728 | (inode.is_dir() && !has_subtree_root_dirfrag()))) { | |
2729 | if (n) | |
2730 | return -1; | |
2731 | n++; | |
2732 | loner = it->first; | |
2733 | } | |
2734 | return loner; | |
2735 | } | |
2736 | ||
b32b8144 | 2737 | bool CInode::choose_ideal_loner() |
7c673cae FG |
2738 | { |
2739 | want_loner_cap = calc_ideal_loner(); | |
b32b8144 FG |
2740 | int changed = false; |
2741 | if (loner_cap >= 0 && loner_cap != want_loner_cap) { | |
2742 | if (!try_drop_loner()) | |
2743 | return false; | |
2744 | changed = true; | |
2745 | } | |
2746 | ||
2747 | if (want_loner_cap >= 0) { | |
2748 | if (loner_cap < 0) { | |
2749 | set_loner_cap(want_loner_cap); | |
2750 | changed = true; | |
2751 | } else | |
2752 | assert(loner_cap == want_loner_cap); | |
2753 | } | |
2754 | return changed; | |
7c673cae FG |
2755 | } |
2756 | ||
2757 | bool CInode::try_set_loner() | |
2758 | { | |
2759 | assert(want_loner_cap >= 0); | |
2760 | if (loner_cap >= 0 && loner_cap != want_loner_cap) | |
2761 | return false; | |
2762 | set_loner_cap(want_loner_cap); | |
2763 | return true; | |
2764 | } | |
2765 | ||
2766 | void CInode::set_loner_cap(client_t l) | |
2767 | { | |
2768 | loner_cap = l; | |
2769 | authlock.set_excl_client(loner_cap); | |
2770 | filelock.set_excl_client(loner_cap); | |
2771 | linklock.set_excl_client(loner_cap); | |
2772 | xattrlock.set_excl_client(loner_cap); | |
2773 | } | |
2774 | ||
2775 | bool CInode::try_drop_loner() | |
2776 | { | |
2777 | if (loner_cap < 0) | |
2778 | return true; | |
2779 | ||
2780 | int other_allowed = get_caps_allowed_by_type(CAP_ANY); | |
2781 | Capability *cap = get_client_cap(loner_cap); | |
2782 | if (!cap || | |
2783 | (cap->issued() & ~other_allowed) == 0) { | |
2784 | set_loner_cap(-1); | |
2785 | return true; | |
2786 | } | |
2787 | return false; | |
2788 | } | |
2789 | ||
2790 | ||
// choose new lock state during recovery, based on issued caps
void CInode::choose_lock_state(SimpleLock *lock, int allissued)
{
  // Pull out the cap bits governed by this particular lock from the
  // aggregate issued-caps mask.
  int shift = lock->get_cap_shift();
  int issued = (allissued >> shift) & lock->get_cap_mask();
  if (is_auth()) {
    if (lock->is_xlocked()) {
      // do nothing here
    } else if (lock->get_state() != LOCK_MIX) {
      // pick the most permissive state consistent with the caps that
      // clients already hold, in priority order
      if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER))
	lock->set_state(LOCK_EXCL);
      else if (issued & CEPH_CAP_GWR)
	lock->set_state(LOCK_MIX);
      else if (lock->is_dirty()) {
	if (is_replicated())
	  lock->set_state(LOCK_MIX);
	else
	  lock->set_state(LOCK_LOCK);
      } else
	lock->set_state(LOCK_SYNC);
    }
  } else {
    // our states have already been chosen during rejoin.
    if (lock->is_xlocked())
      assert(lock->get_state() == LOCK_LOCK);
  }
}
2818 | ||
2819 | void CInode::choose_lock_states(int dirty_caps) | |
2820 | { | |
2821 | int issued = get_caps_issued() | dirty_caps; | |
b32b8144 FG |
2822 | if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR))) |
2823 | choose_ideal_loner(); | |
7c673cae FG |
2824 | choose_lock_state(&filelock, issued); |
2825 | choose_lock_state(&nestlock, issued); | |
2826 | choose_lock_state(&dirfragtreelock, issued); | |
2827 | choose_lock_state(&authlock, issued); | |
2828 | choose_lock_state(&xattrlock, issued); | |
2829 | choose_lock_state(&linklock, issued); | |
2830 | } | |
2831 | ||
2832 | Capability *CInode::add_client_cap(client_t client, Session *session, SnapRealm *conrealm) | |
2833 | { | |
2834 | if (client_caps.empty()) { | |
2835 | get(PIN_CAPS); | |
2836 | if (conrealm) | |
2837 | containing_realm = conrealm; | |
2838 | else | |
2839 | containing_realm = find_snaprealm(); | |
2840 | containing_realm->inodes_with_caps.push_back(&item_caps); | |
2841 | dout(10) << "add_client_cap first cap, joining realm " << *containing_realm << dendl; | |
2842 | } | |
2843 | ||
2844 | if (client_caps.empty()) | |
2845 | mdcache->num_inodes_with_caps++; | |
2846 | ||
2847 | Capability *cap = new Capability(this, ++mdcache->last_cap_id, client); | |
2848 | assert(client_caps.count(client) == 0); | |
2849 | client_caps[client] = cap; | |
2850 | ||
2851 | session->add_cap(cap); | |
2852 | if (session->is_stale()) | |
2853 | cap->mark_stale(); | |
2854 | ||
2855 | cap->client_follows = first-1; | |
2856 | ||
2857 | containing_realm->add_cap(client, cap); | |
2858 | ||
2859 | return cap; | |
2860 | } | |
2861 | ||
void CInode::remove_client_cap(client_t client)
{
  // Drop the given client's capability on this inode: unlink it from
  // session/revoking/realm lists, clear loner state if needed, free it,
  // and — when it was the last cap — unpin the inode and detach it from
  // the realm and log segment.  Finally, release any advisory locks the
  // client held and wake waiters blocked on them.
  assert(client_caps.count(client) == 1);
  Capability *cap = client_caps[client];

  // detach the cap from every intrusive list before deleting it
  cap->item_session_caps.remove_myself();
  cap->item_revoking_caps.remove_myself();
  cap->item_client_revoking_caps.remove_myself();
  containing_realm->remove_cap(client, cap);

  if (client == loner_cap)
    loner_cap = -1;

  delete cap;
  client_caps.erase(client);
  if (client_caps.empty()) {
    // last cap gone: undo everything add_client_cap() set up
    dout(10) << "remove_client_cap last cap, leaving realm " << *containing_realm << dendl;
    put(PIN_CAPS);
    item_caps.remove_myself();
    containing_realm = NULL;
    item_open_file.remove_myself(); // unpin logsegment
    mdcache->num_inodes_with_caps--;
  }

  //clean up advisory locks
  bool fcntl_removed = fcntl_locks ? fcntl_locks->remove_all_from(client) : false;
  bool flock_removed = flock_locks ? flock_locks->remove_all_from(client) : false;
  if (fcntl_removed || flock_removed) {
    // releasing locks may unblock other clients waiting on them
    list<MDSInternalContextBase*> waiters;
    take_waiting(CInode::WAIT_FLOCK, waiters);
    mdcache->mds->queue_waiters(waiters);
  }
}
2895 | ||
2896 | void CInode::move_to_realm(SnapRealm *realm) | |
2897 | { | |
2898 | dout(10) << "move_to_realm joining realm " << *realm | |
2899 | << ", leaving realm " << *containing_realm << dendl; | |
2900 | for (map<client_t,Capability*>::iterator q = client_caps.begin(); | |
2901 | q != client_caps.end(); | |
2902 | ++q) { | |
2903 | containing_realm->remove_cap(q->first, q->second); | |
2904 | realm->add_cap(q->first, q->second); | |
2905 | } | |
2906 | item_caps.remove_myself(); | |
2907 | realm->inodes_with_caps.push_back(&item_caps); | |
2908 | containing_realm = realm; | |
2909 | } | |
2910 | ||
2911 | Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session) | |
2912 | { | |
2913 | Capability *cap = get_client_cap(client); | |
2914 | if (cap) { | |
2915 | // FIXME? | |
2916 | cap->merge(icr.capinfo.wanted, icr.capinfo.issued); | |
2917 | } else { | |
2918 | cap = add_client_cap(client, session); | |
2919 | cap->set_cap_id(icr.capinfo.cap_id); | |
2920 | cap->set_wanted(icr.capinfo.wanted); | |
2921 | cap->issue_norevoke(icr.capinfo.issued); | |
2922 | cap->reset_seq(); | |
2923 | } | |
2924 | cap->set_last_issue_stamp(ceph_clock_now()); | |
2925 | return cap; | |
2926 | } | |
2927 | ||
2928 | void CInode::clear_client_caps_after_export() | |
2929 | { | |
2930 | while (!client_caps.empty()) | |
2931 | remove_client_cap(client_caps.begin()->first); | |
2932 | loner_cap = -1; | |
2933 | want_loner_cap = -1; | |
2934 | mds_caps_wanted.clear(); | |
2935 | } | |
2936 | ||
2937 | void CInode::export_client_caps(map<client_t,Capability::Export>& cl) | |
2938 | { | |
2939 | for (map<client_t,Capability*>::iterator it = client_caps.begin(); | |
2940 | it != client_caps.end(); | |
2941 | ++it) { | |
2942 | cl[it->first] = it->second->make_export(); | |
2943 | } | |
2944 | } | |
2945 | ||
2946 | // caps allowed | |
2947 | int CInode::get_caps_liked() const | |
2948 | { | |
2949 | if (is_dir()) | |
2950 | return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; // but not, say, FILE_RD|WR|WRBUFFER | |
2951 | else | |
2952 | return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO; | |
2953 | } | |
2954 | ||
2955 | int CInode::get_caps_allowed_ever() const | |
2956 | { | |
2957 | int allowed; | |
2958 | if (is_dir()) | |
2959 | allowed = CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; | |
2960 | else | |
2961 | allowed = CEPH_CAP_ANY; | |
2962 | return allowed & | |
2963 | (CEPH_CAP_PIN | | |
2964 | (filelock.gcaps_allowed_ever() << filelock.get_cap_shift()) | | |
2965 | (authlock.gcaps_allowed_ever() << authlock.get_cap_shift()) | | |
2966 | (xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift()) | | |
2967 | (linklock.gcaps_allowed_ever() << linklock.get_cap_shift())); | |
2968 | } | |
2969 | ||
2970 | int CInode::get_caps_allowed_by_type(int type) const | |
2971 | { | |
2972 | return | |
2973 | CEPH_CAP_PIN | | |
2974 | (filelock.gcaps_allowed(type) << filelock.get_cap_shift()) | | |
2975 | (authlock.gcaps_allowed(type) << authlock.get_cap_shift()) | | |
2976 | (xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) | | |
2977 | (linklock.gcaps_allowed(type) << linklock.get_cap_shift()); | |
2978 | } | |
2979 | ||
2980 | int CInode::get_caps_careful() const | |
2981 | { | |
2982 | return | |
2983 | (filelock.gcaps_careful() << filelock.get_cap_shift()) | | |
2984 | (authlock.gcaps_careful() << authlock.get_cap_shift()) | | |
2985 | (xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) | | |
2986 | (linklock.gcaps_careful() << linklock.get_cap_shift()); | |
2987 | } | |
2988 | ||
2989 | int CInode::get_xlocker_mask(client_t client) const | |
2990 | { | |
2991 | return | |
2992 | (filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) | | |
2993 | (authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) | | |
2994 | (xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) | | |
2995 | (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift()); | |
2996 | } | |
2997 | ||
2998 | int CInode::get_caps_allowed_for_client(Session *session, inode_t *file_i) const | |
2999 | { | |
3000 | client_t client = session->info.inst.name.num(); | |
3001 | int allowed; | |
3002 | if (client == get_loner()) { | |
3003 | // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked | |
3004 | allowed = | |
3005 | get_caps_allowed_by_type(CAP_LONER) | | |
3006 | (get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client)); | |
3007 | } else { | |
3008 | allowed = get_caps_allowed_by_type(CAP_ANY); | |
3009 | } | |
3010 | ||
3011 | if (!is_dir()) { | |
3012 | if ((file_i->inline_data.version != CEPH_INLINE_NONE && | |
3013 | !session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) || | |
3014 | (!file_i->layout.pool_ns.empty() && | |
3015 | !session->connection->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))) | |
3016 | allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR); | |
3017 | } | |
3018 | return allowed; | |
3019 | } | |
3020 | ||
3021 | // caps issued, wanted | |
3022 | int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker, | |
3023 | int shift, int mask) | |
3024 | { | |
3025 | int c = 0; | |
3026 | int loner = 0, other = 0, xlocker = 0; | |
3027 | if (!is_auth()) { | |
3028 | loner_cap = -1; | |
3029 | } | |
3030 | ||
3031 | for (map<client_t,Capability*>::const_iterator it = client_caps.begin(); | |
3032 | it != client_caps.end(); | |
3033 | ++it) { | |
3034 | int i = it->second->issued(); | |
3035 | c |= i; | |
3036 | if (it->first == loner_cap) | |
3037 | loner |= i; | |
3038 | else | |
3039 | other |= i; | |
3040 | xlocker |= get_xlocker_mask(it->first) & i; | |
3041 | } | |
3042 | if (ploner) *ploner = (loner >> shift) & mask; | |
3043 | if (pother) *pother = (other >> shift) & mask; | |
3044 | if (pxlocker) *pxlocker = (xlocker >> shift) & mask; | |
3045 | return (c >> shift) & mask; | |
3046 | } | |
3047 | ||
3048 | bool CInode::is_any_caps_wanted() const | |
3049 | { | |
3050 | for (map<client_t,Capability*>::const_iterator it = client_caps.begin(); | |
3051 | it != client_caps.end(); | |
3052 | ++it) | |
3053 | if (it->second->wanted()) | |
3054 | return true; | |
3055 | return false; | |
3056 | } | |
3057 | ||
3058 | int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const | |
3059 | { | |
3060 | int w = 0; | |
3061 | int loner = 0, other = 0; | |
3062 | for (map<client_t,Capability*>::const_iterator it = client_caps.begin(); | |
3063 | it != client_caps.end(); | |
3064 | ++it) { | |
3065 | if (!it->second->is_stale()) { | |
3066 | int t = it->second->wanted(); | |
3067 | w |= t; | |
3068 | if (it->first == loner_cap) | |
3069 | loner |= t; | |
3070 | else | |
3071 | other |= t; | |
3072 | } | |
3073 | //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl; | |
3074 | } | |
3075 | if (is_auth()) | |
3076 | for (compact_map<int,int>::const_iterator it = mds_caps_wanted.begin(); | |
3077 | it != mds_caps_wanted.end(); | |
3078 | ++it) { | |
3079 | w |= it->second; | |
3080 | other |= it->second; | |
3081 | //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl; | |
3082 | } | |
3083 | if (ploner) *ploner = (loner >> shift) & mask; | |
3084 | if (pother) *pother = (other >> shift) & mask; | |
3085 | return (w >> shift) & mask; | |
3086 | } | |
3087 | ||
3088 | bool CInode::issued_caps_need_gather(SimpleLock *lock) | |
3089 | { | |
3090 | int loner_issued, other_issued, xlocker_issued; | |
3091 | get_caps_issued(&loner_issued, &other_issued, &xlocker_issued, | |
3092 | lock->get_cap_shift(), lock->get_cap_mask()); | |
3093 | if ((loner_issued & ~lock->gcaps_allowed(CAP_LONER)) || | |
3094 | (other_issued & ~lock->gcaps_allowed(CAP_ANY)) || | |
3095 | (xlocker_issued & ~lock->gcaps_allowed(CAP_XLOCKER))) | |
3096 | return true; | |
3097 | return false; | |
3098 | } | |
3099 | ||
void CInode::replicate_relax_locks()
{
  // Called when an auth inode that is not yet replicated is about to
  // be replicated: relax every lock's state via its replicate_relax().
  // Only legal on the auth copy with no replicas (asserted below).
  //dout(10) << " relaxing locks on " << *this << dendl;
  assert(is_auth());
  assert(!is_replicated());

  authlock.replicate_relax();
  linklock.replicate_relax();
  dirfragtreelock.replicate_relax();
  filelock.replicate_relax();
  xattrlock.replicate_relax();
  snaplock.replicate_relax();
  nestlock.replicate_relax();
  flocklock.replicate_relax();
  policylock.replicate_relax();
}
3116 | ||
3117 | ||
3118 | ||
3119 | // ============================================= | |
3120 | ||
// Encode a client-visible stat + capability record for this inode into
// 'bl' (wire layout matches MClientReply::InodeStat).  As a side effect
// this may create/update the client's Capability and issue caps.
// Returns 'valid' (false when snapped metadata could not be provided
// authoritatively), or -ENOSPC when the encoding would exceed
// max_bytes.
int CInode::encode_inodestat(bufferlist& bl, Session *session,
			     SnapRealm *dir_realm,
			     snapid_t snapid,
			     unsigned max_bytes,
			     int getattr_caps)
{
  client_t client = session->info.inst.name.num();
  assert(snapid);
  assert(session->connection);

  bool valid = true;

  // pick a version!
  inode_t *oi = &inode;
  inode_t *pi = get_projected_inode();

  map<string, bufferptr> *pxattrs = 0;

  if (snapid != CEPH_NOSNAP) {

    // for now at least, old_inodes is only defined/valid on the auth
    if (!is_auth())
      valid = false;

    if (is_multiversion()) {
      // find the old_inode interval covering 'snapid', if any
      compact_map<snapid_t,old_inode_t>::iterator p = old_inodes.lower_bound(snapid);
      if (p != old_inodes.end()) {
	if (p->second.first > snapid) {
	  if (p != old_inodes.begin())
	    --p;
	}
	if (p->second.first <= snapid && snapid <= p->first) {
	  dout(15) << "encode_inodestat snapid " << snapid
		   << " to old_inode [" << p->second.first << "," << p->first << "]"
		   << " " << p->second.inode.rstat
		   << dendl;
	  // serve the stat from the historical inode + xattrs
	  pi = oi = &p->second.inode;
	  pxattrs = &p->second.xattrs;
	} else {
	  // snapshoted remote dentry can result this
	  dout(0) << "encode_inodestat old_inode for snapid " << snapid
		  << " not found" << dendl;
	}
      }
    } else if (snapid < first || snapid > last) {
      // snapshoted remote dentry can result this
      dout(0) << "encode_inodestat [" << first << "," << last << "]"
	      << " not match snapid " << snapid << dendl;
    }
  }

  SnapRealm *realm = find_snaprealm();

  // several conditions prevent issuing caps at all; log which one hit
  bool no_caps = !valid ||
		 session->is_stale() ||
		 (dir_realm && realm != dir_realm) ||
		 is_frozen() ||
		 state_test(CInode::STATE_EXPORTINGCAPS);
  if (no_caps)
    dout(20) << "encode_inodestat no caps"
	     << (!valid?", !valid":"")
	     << (session->is_stale()?", session stale ":"")
	     << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"")
	     << (is_frozen()?", frozen inode":"")
	     << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"")
	     << dendl;


  // "fake" a version that is old (stable) version, +1 if projected.
  version_t version = (oi->version * 2) + is_projected();

  Capability *cap = get_client_cap(client);
  // decide, per metadata group, whether this client may see projected
  // (not-yet-committed) values: it may if it xlocked the lock or is the loner
  bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client;
  //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
  bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client;
  bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client;
  bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client;

  bool plocal = versionlock.get_last_wrlock_client() == client;
  bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client;

  inode_t *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi;

  dout(20) << " pfile " << pfile << " pauth " << pauth
	   << " plink " << plink << " pxattr " << pxattr
	   << " plocal " << plocal
	   << " ctime " << any_i->ctime
	   << " valid=" << valid << dendl;

  // file
  inode_t *file_i = pfile ? pi:oi;
  file_layout_t layout;
  if (is_dir()) {
    layout = (ppolicy ? pi : oi)->layout;
  } else {
    layout = file_i->layout;
  }

  // max_size is min of projected, actual
  uint64_t max_size =
    MIN(oi->client_ranges.count(client) ?
	oi->client_ranges[client].range.last : 0,
	pi->client_ranges.count(client) ?
	pi->client_ranges[client].range.last : 0);

  // inline data
  version_t inline_version = 0;
  bufferlist inline_data;
  if (file_i->inline_data.version == CEPH_INLINE_NONE) {
    inline_version = CEPH_INLINE_NONE;
  } else if ((!cap && !no_caps) ||
	     (cap && cap->client_inline_version < file_i->inline_data.version) ||
	     (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data
    inline_version = file_i->inline_data.version;
    if (file_i->inline_data.length() > 0)
      inline_data = file_i->inline_data.get_data();
  }

  // nest (do same as file... :/)
  if (cap) {
    cap->last_rbytes = file_i->rstat.rbytes;
    cap->last_rsize = file_i->rstat.rsize();
  }

  // auth
  inode_t *auth_i = pauth ? pi:oi;

  // link
  inode_t *link_i = plink ? pi:oi;

  // xattr
  inode_t *xattr_i = pxattr ? pi:oi;

  // xattr
  bufferlist xbl;
  version_t xattr_version;
  if ((!cap && !no_caps) ||
      (cap && cap->client_xattr_version < xattr_i->xattr_version) ||
      (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs
    if (!pxattrs)
      pxattrs = pxattr ? get_projected_xattrs() : &xattrs;
    ::encode(*pxattrs, xbl);
    xattr_version = xattr_i->xattr_version;
  } else {
    xattr_version = 0;
  }

  // do we have room?
  if (max_bytes) {
    // conservative estimate of the encoded size; must track the
    // ::encode calls at the bottom of this function
    unsigned bytes = 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) +
      sizeof(struct ceph_file_layout) + 4 + layout.pool_ns.size() +
      sizeof(struct ceph_timespec) * 3 +
      4 + 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 +
      8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) +
      4;
    bytes += sizeof(__u32);
    bytes += (sizeof(__u32) + sizeof(__u32)) * dirfragtree._splits.size();
    bytes += sizeof(__u32) + symlink.length();
    bytes += sizeof(__u32) + xbl.length();
    bytes += sizeof(version_t) + sizeof(__u32) + inline_data.length();
    if (bytes > max_bytes)
      return -ENOSPC;
  }


  // encode caps
  struct ceph_mds_reply_cap ecap;
  if (snapid != CEPH_NOSNAP) {
    /*
     * snapped inodes (files or dirs) only get read-only caps.  always
     * issue everything possible, since it is read only.
     *
     * if a snapped inode has caps, limit issued caps based on the
     * lock state.
     *
     * if it is a live inode, limit issued caps based on the lock
     * state.
     *
     * do NOT adjust cap issued state, because the client always
     * tracks caps per-snap and the mds does either per-interval or
     * multiversion.
     */
    ecap.caps = valid ? get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE;
    if (last == CEPH_NOSNAP || is_any_caps())
      ecap.caps = ecap.caps & get_caps_allowed_for_client(session, file_i);
    ecap.seq = 0;
    ecap.mseq = 0;
    ecap.realm = 0;
  } else {
    if (!no_caps && !cap) {
      // add a new cap
      cap = add_client_cap(client, session, realm);
      if (is_auth())
	choose_ideal_loner();
    }

    int issue = 0;
    if (!no_caps && cap) {
      int likes = get_caps_liked();
      int allowed = get_caps_allowed_for_client(session, file_i);
      issue = (cap->wanted() | likes) & allowed;
      cap->issue_norevoke(issue);
      issue = cap->pending();
      dout(10) << "encode_inodestat issuing " << ccap_string(issue)
	       << " seq " << cap->get_last_seq() << dendl;
    } else if (cap && cap->is_new() && !dir_realm) {
      // alway issue new caps to client, otherwise the caps get lost
      assert(cap->is_stale());
      issue = cap->pending() | CEPH_CAP_PIN;
      cap->issue_norevoke(issue);
      dout(10) << "encode_inodestat issuing " << ccap_string(issue)
	       << " seq " << cap->get_last_seq()
	       << "(stale|new caps)" << dendl;
    }

    if (issue) {
      // fill the wire cap record from the freshly-issued cap
      cap->set_last_issue();
      cap->set_last_issue_stamp(ceph_clock_now());
      cap->clear_new();
      ecap.caps = issue;
      ecap.wanted = cap->wanted();
      ecap.cap_id = cap->get_cap_id();
      ecap.seq = cap->get_last_seq();
      ecap.mseq = cap->get_mseq();
      ecap.realm = realm->inode->ino();
    } else {
      // no caps issued: zero out the record
      ecap.cap_id = 0;
      ecap.caps = 0;
      ecap.seq = 0;
      ecap.mseq = 0;
      ecap.realm = 0;
      ecap.wanted = 0;
    }
  }
  ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0;
  dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps)
	   << " seq " << ecap.seq << " mseq " << ecap.mseq
	   << " xattrv " << xattr_version << " len " << xbl.length()
	   << dendl;

  if (inline_data.length() && cap) {
    if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) {
      dout(10) << "including inline version " << inline_version << dendl;
      cap->client_inline_version = inline_version;
    } else {
      dout(10) << "dropping inline version " << inline_version << dendl;
      inline_version = 0;
      inline_data.clear();
    }
  }

  // include those xattrs?
  if (xbl.length() && cap) {
    if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) {
      dout(10) << "including xattrs version " << xattr_i->xattr_version << dendl;
      cap->client_xattr_version = xattr_i->xattr_version;
    } else {
      dout(10) << "dropping xattrs version " << xattr_i->xattr_version << dendl;
      xbl.clear(); // no xattrs .. XXX what's this about?!?
      xattr_version = 0;
    }
  }

  /*
   * note: encoding matches MClientReply::InodeStat
   */
  ::encode(oi->ino, bl);
  ::encode(snapid, bl);
  ::encode(oi->rdev, bl);
  ::encode(version, bl);

  ::encode(xattr_version, bl);

  ::encode(ecap, bl);
  {
    ceph_file_layout legacy_layout;
    layout.to_legacy(&legacy_layout);
    ::encode(legacy_layout, bl);
  }
  ::encode(any_i->ctime, bl);
  ::encode(file_i->mtime, bl);
  ::encode(file_i->atime, bl);
  ::encode(file_i->time_warp_seq, bl);
  ::encode(file_i->size, bl);
  ::encode(max_size, bl);
  ::encode(file_i->truncate_size, bl);
  ::encode(file_i->truncate_seq, bl);

  ::encode(auth_i->mode, bl);
  ::encode((uint32_t)auth_i->uid, bl);
  ::encode((uint32_t)auth_i->gid, bl);

  ::encode(link_i->nlink, bl);

  ::encode(file_i->dirstat.nfiles, bl);
  ::encode(file_i->dirstat.nsubdirs, bl);
  ::encode(file_i->rstat.rbytes, bl);
  ::encode(file_i->rstat.rfiles, bl);
  ::encode(file_i->rstat.rsubdirs, bl);
  ::encode(file_i->rstat.rctime, bl);

  dirfragtree.encode(bl);

  ::encode(symlink, bl);
  // feature-gated trailing fields: only clients advertising the
  // matching feature expect them on the wire
  if (session->connection->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) {
    ::encode(file_i->dir_layout, bl);
  }
  ::encode(xbl, bl);
  if (session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
    ::encode(inline_version, bl);
    ::encode(inline_data, bl);
  }
  if (session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA)) {
    inode_t *policy_i = ppolicy ? pi : oi;
    ::encode(policy_i->quota, bl);
  }
  if (session->connection->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
    ::encode(layout.pool_ns, bl);
  }
  if (session->connection->has_feature(CEPH_FEATURE_FS_BTIME)) {
    ::encode(any_i->btime, bl);
    ::encode(any_i->change_attr, bl);
  }

  return valid;
}
3447 | ||
// Populate an MClientCaps message with this inode's metadata for the
// cap's client.  For each metadata group, projected values are used
// when the client xlocked the corresponding lock (or, for file data,
// holds FILE_EXCL); otherwise the stable inode is used.
void CInode::encode_cap_message(MClientCaps *m, Capability *cap)
{
  assert(cap);

  client_t client = cap->get_client();

  bool pfile = filelock.is_xlocked_by_client(client) || (cap->issued() & CEPH_CAP_FILE_EXCL);
  bool pauth = authlock.is_xlocked_by_client(client);
  bool plink = linklock.is_xlocked_by_client(client);
  bool pxattr = xattrlock.is_xlocked_by_client(client);

  inode_t *oi = &inode;
  inode_t *pi = get_projected_inode();
  inode_t *i = (pfile|pauth|plink|pxattr) ? pi : oi;

  dout(20) << "encode_cap_message pfile " << pfile
	   << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr
	   << " ctime " << i->ctime << dendl;

  // file metadata group
  i = pfile ? pi:oi;
  m->set_layout(i->layout);
  m->size = i->size;
  m->truncate_seq = i->truncate_seq;
  m->truncate_size = i->truncate_size;
  m->mtime = i->mtime;
  m->atime = i->atime;
  m->ctime = i->ctime;
  m->change_attr = i->change_attr;
  m->time_warp_seq = i->time_warp_seq;

  // include inline data only when the client's copy is older
  if (cap->client_inline_version < i->inline_data.version) {
    m->inline_version = cap->client_inline_version = i->inline_data.version;
    if (i->inline_data.length() > 0)
      m->inline_data = i->inline_data.get_data();
  } else {
    m->inline_version = 0;
  }

  // max_size is min of projected, actual.
  uint64_t oldms = oi->client_ranges.count(client) ? oi->client_ranges[client].range.last : 0;
  uint64_t newms = pi->client_ranges.count(client) ? pi->client_ranges[client].range.last : 0;
  m->max_size = MIN(oldms, newms);

  // auth metadata group (mode/owner)
  i = pauth ? pi:oi;
  m->head.mode = i->mode;
  m->head.uid = i->uid;
  m->head.gid = i->gid;

  // link metadata group
  i = plink ? pi:oi;
  m->head.nlink = i->nlink;

  // xattr group: only send xattrs when the client holds XATTR_SHARED
  // and its cached version is stale
  i = pxattr ? pi:oi;
  map<string,bufferptr> *ix = pxattr ? get_projected_xattrs() : &xattrs;
  if ((cap->pending() & CEPH_CAP_XATTR_SHARED) &&
      i->xattr_version > cap->client_xattr_version) {
    dout(10) << " including xattrs v " << i->xattr_version << dendl;
    ::encode(*ix, m->xattrbl);
    m->head.xattr_version = i->xattr_version;
    cap->client_xattr_version = i->xattr_version;
  }
}
3509 | ||
3510 | ||
3511 | ||
void CInode::_encode_base(bufferlist& bl, uint64_t features)
{
  // Serialize core inode state for inter-MDS transfer.  Field order is
  // part of the wire format and must match _decode_base() exactly.
  ::encode(first, bl);
  ::encode(inode, bl, features);
  ::encode(symlink, bl);
  ::encode(dirfragtree, bl);
  ::encode(xattrs, bl);
  ::encode(old_inodes, bl, features);
  ::encode(damage_flags, bl);
  encode_snap(bl);
}
void CInode::_decode_base(bufferlist::iterator& p)
{
  // Inverse of _encode_base(); field order must match it exactly.
  ::decode(first, p);
  ::decode(inode, p);
  ::decode(symlink, p);
  ::decode(dirfragtree, p);
  ::decode(xattrs, p);
  ::decode(old_inodes, p);
  ::decode(damage_flags, p);
  decode_snap(p);
}
3534 | ||
void CInode::_encode_locks_full(bufferlist& bl)
{
  // Serialize complete state of every inode lock plus the loner
  // assignment.  Order is part of the wire format and must match
  // _decode_locks_full() exactly.
  ::encode(authlock, bl);
  ::encode(linklock, bl);
  ::encode(dirfragtreelock, bl);
  ::encode(filelock, bl);
  ::encode(xattrlock, bl);
  ::encode(snaplock, bl);
  ::encode(nestlock, bl);
  ::encode(flocklock, bl);
  ::encode(policylock, bl);

  ::encode(loner_cap, bl);
}
/**
 * Deserialize the complete lock state written by _encode_locks_full().
 * Also installs the decoded loner cap; the want-loner value is seeded
 * from it and is expected to be re-evaluated shortly afterwards.
 */
void CInode::_decode_locks_full(bufferlist::iterator& p)
{
  ::decode(authlock, p);
  ::decode(linklock, p);
  ::decode(dirfragtreelock, p);
  ::decode(filelock, p);
  ::decode(xattrlock, p);
  ::decode(snaplock, p);
  ::decode(nestlock, p);
  ::decode(flocklock, p);
  ::decode(policylock, p);

  ::decode(loner_cap, p);
  set_loner_cap(loner_cap);
  want_loner_cap = loner_cap;  // for now, we'll eval() shortly.
}
3565 | ||
/**
 * Serialize per-lock replica state for sending this inode to a replica,
 * followed by a flag telling the replica whether the auth is recovering.
 *
 * NOTE: lock order here (nestlock before xattrlock) differs from
 * _encode_locks_full(); it must match _decode_locks_state().
 *
 * @param need_recover true if the auth MDS is replicating while it is
 *                     still recovering (see _decode_locks_state())
 */
void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
{
  authlock.encode_state_for_replica(bl);
  linklock.encode_state_for_replica(bl);
  dirfragtreelock.encode_state_for_replica(bl);
  filelock.encode_state_for_replica(bl);
  nestlock.encode_state_for_replica(bl);
  xattrlock.encode_state_for_replica(bl);
  snaplock.encode_state_for_replica(bl);
  flocklock.encode_state_for_replica(bl);
  policylock.encode_state_for_replica(bl);
  ::encode(need_recover, bl);
}
b32b8144 | 3579 | |
7c673cae FG |
/**
 * Serialize lock state for cache rejoin to replica mds.<rep>.
 *
 * The scatterlocks (dirfragtree/file/nest) use the rejoin-specific
 * encoding, which is replica-aware; the remaining locks use the plain
 * replica encoding.  No need_recover flag is appended here, unlike
 * _encode_locks_state_for_replica().
 */
void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
{
  authlock.encode_state_for_replica(bl);
  linklock.encode_state_for_replica(bl);
  dirfragtreelock.encode_state_for_rejoin(bl, rep);
  filelock.encode_state_for_rejoin(bl, rep);
  nestlock.encode_state_for_rejoin(bl, rep);
  xattrlock.encode_state_for_replica(bl);
  snaplock.encode_state_for_replica(bl);
  flocklock.encode_state_for_replica(bl);
  policylock.encode_state_for_replica(bl);
}
b32b8144 | 3592 | |
7c673cae FG |
/**
 * Deserialize the per-lock replica state written by
 * _encode_locks_state_for_replica(), in the same lock order.
 *
 * @param is_new true if this replica is newly created (rather than
 *               refreshed); only then is the need_recover flag honored
 */
void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new)
{
  authlock.decode_state(p, is_new);
  linklock.decode_state(p, is_new);
  dirfragtreelock.decode_state(p, is_new);
  filelock.decode_state(p, is_new);
  nestlock.decode_state(p, is_new);
  xattrlock.decode_state(p, is_new);
  snaplock.decode_state(p, is_new);
  flocklock.decode_state(p, is_new);
  policylock.decode_state(p, is_new);

  bool need_recover;
  ::decode(need_recover, p);
  if (need_recover && is_new) {
    // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
    // and change the object when replaying unsafe requests.
    authlock.mark_need_recover();
    linklock.mark_need_recover();
    dirfragtreelock.mark_need_recover();
    filelock.mark_need_recover();
    nestlock.mark_need_recover();
    xattrlock.mark_need_recover();
    snaplock.mark_need_recover();
    flocklock.mark_need_recover();
    policylock.mark_need_recover();
  }
}
/**
 * Deserialize lock state during cache rejoin.
 *
 * @param p          encoded state from _encode_locks_state_for_rejoin()
 * @param waiters    out: contexts to wake once locks settle
 * @param eval_locks out: scatterlocks that are unstable (and not write
 *                   locked) and therefore need re-evaluation by the caller
 * @param survivor   true if this MDS survived the failure (vs recovering)
 */
void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters,
				  list<SimpleLock*>& eval_locks, bool survivor)
{
  authlock.decode_state_rejoin(p, waiters, survivor);
  linklock.decode_state_rejoin(p, waiters, survivor);
  dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
  filelock.decode_state_rejoin(p, waiters, survivor);
  nestlock.decode_state_rejoin(p, waiters, survivor);
  xattrlock.decode_state_rejoin(p, waiters, survivor);
  snaplock.decode_state_rejoin(p, waiters, survivor);
  flocklock.decode_state_rejoin(p, waiters, survivor);
  policylock.decode_state_rejoin(p, waiters, survivor);

  // only the scatterlocks can be left unstable by rejoin; queue them
  // for evaluation unless a write lock is holding them there
  if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
    eval_locks.push_back(&dirfragtreelock);
  if (!filelock.is_stable() && !filelock.is_wrlocked())
    eval_locks.push_back(&filelock);
  if (!nestlock.is_stable() && !nestlock.is_wrlocked())
    eval_locks.push_back(&nestlock);
}
3641 | ||
3642 | ||
3643 | // IMPORT/EXPORT | |
3644 | ||
/**
 * Serialize this inode for migration (export) to another MDS: base state,
 * state flags, popularity, replica map, fragstat/rstat for any bounding
 * dirfrags, full lock state, and file locks.  Versioned with
 * ENCODE_START(5, 4, ...); decode side is decode_import().
 *
 * Takes a PIN_TEMPEXPORTING reference that finish_export() releases.
 */
void CInode::encode_export(bufferlist& bl)
{
  ENCODE_START(5, 4, bl);
  _encode_base(bl, mdcache->mds->mdsmap->get_up_features());

  ::encode(state, bl);

  ::encode(pop, bl);

  ::encode(get_replicas(), bl);

  // include scatterlock info for any bounding CDirs
  bufferlist bounding;
  if (inode.is_dir())
    for (compact_map<frag_t,CDir*>::iterator p = dirfrags.begin();
	 p != dirfrags.end();
	 ++p) {
      CDir *dir = p->second;
      if (dir->state_test(CDir::STATE_EXPORTBOUND)) {
	::encode(p->first, bounding);
	::encode(dir->fnode.fragstat, bounding);
	::encode(dir->fnode.accounted_fragstat, bounding);
	::encode(dir->fnode.rstat, bounding);
	::encode(dir->fnode.accounted_rstat, bounding);
	dout(10) << " encoded fragstat/rstat info for " << *dir << dendl;
      }
    }
  ::encode(bounding, bl);   // nested bufferlist: decode side iterates until end

  _encode_locks_full(bl);

  _encode_file_locks(bl);

  ENCODE_FINISH(bl);

  // pin until finish_export() confirms the migration completed
  get(PIN_TEMPEXPORTING);
}
3682 | ||
3683 | void CInode::finish_export(utime_t now) | |
3684 | { | |
3685 | state &= MASK_STATE_EXPORT_KEPT; | |
3686 | ||
3687 | pop.zero(now); | |
3688 | ||
3689 | // just in case! | |
3690 | //dirlock.clear_updated(); | |
3691 | ||
3692 | loner_cap = -1; | |
3693 | ||
3694 | put(PIN_TEMPEXPORTING); | |
3695 | } | |
3696 | ||
/**
 * Deserialize an inode migrated from another MDS (counterpart of
 * encode_export()).  Marks this inode auth, re-establishes dirty pins,
 * and selectively applies fragstat/rstat for bounding dirfrags.
 *
 * @param p  iterator over the exported payload
 * @param ls log segment used to record dirty state locally
 */
void CInode::decode_import(bufferlist::iterator& p,
			   LogSegment *ls)
{
  DECODE_START(5, p);

  _decode_base(p);

  unsigned s;
  ::decode(s, p);
  // we become auth; only the EXPORTED subset of flags carries over
  state_set(STATE_AUTH | (s & MASK_STATE_EXPORTED));

  if (is_dirty()) {
    get(PIN_DIRTY);
    _mark_dirty(ls);
  }
  if (is_dirty_parent()) {
    get(PIN_DIRTYPARENT);
    _mark_dirty_parent(ls);
  }

  ::decode(pop, ceph_clock_now(), p);

  ::decode(get_replicas(), p);
  if (is_replicated())
    get(PIN_REPLICATED);
  replica_nonce = 0;

  // decode fragstat info on bounding cdirs
  bufferlist bounding;
  ::decode(bounding, p);
  bufferlist::iterator q = bounding.begin();
  while (!q.end()) {
    frag_t fg;
    ::decode(fg, q);
    CDir *dir = get_dirfrag(fg);
    assert(dir);  // we should have all bounds open

    // Only take the remote's fragstat/rstat if we are non-auth for
    // this dirfrag AND the lock is NOT in a scattered (MIX) state.
    // We know lock is stable, and MIX is the only state in which
    // the inode auth (who sent us this data) may not have the best
    // info.

    // HMM: Are there cases where dir->is_auth() is an insufficient
    // check because the dirfrag is under migration?  That implies
    // it is frozen (and in a SYNC or LOCK state).  FIXME.

    if (dir->is_auth() ||
        filelock.get_state() == LOCK_MIX) {
      dout(10) << " skipped fragstat info for " << *dir << dendl;
      // must still consume the two encoded frag_info_t entries
      frag_info_t f;
      ::decode(f, q);
      ::decode(f, q);
    } else {
      ::decode(dir->fnode.fragstat, q);
      ::decode(dir->fnode.accounted_fragstat, q);
      dout(10) << " took fragstat info for " << *dir << dendl;
    }
    if (dir->is_auth() ||
        nestlock.get_state() == LOCK_MIX) {
      dout(10) << " skipped rstat info for " << *dir << dendl;
      // likewise, consume the two encoded nest_info_t entries
      nest_info_t n;
      ::decode(n, q);
      ::decode(n, q);
    } else {
      ::decode(dir->fnode.rstat, q);
      ::decode(dir->fnode.accounted_rstat, q);
      dout(10) << " took rstat info for " << *dir << dendl;
    }
  }

  _decode_locks_full(p);

  _decode_file_locks(p);

  DECODE_FINISH(p);
}
3774 | ||
3775 | ||
3776 | void InodeStoreBase::dump(Formatter *f) const | |
3777 | { | |
3778 | inode.dump(f); | |
3779 | f->dump_string("symlink", symlink); | |
3780 | f->open_array_section("old_inodes"); | |
3781 | for (compact_map<snapid_t, old_inode_t>::const_iterator i = old_inodes.begin(); | |
3782 | i != old_inodes.end(); ++i) { | |
3783 | f->open_object_section("old_inode"); | |
3784 | { | |
3785 | // The key is the last snapid, the first is in the old_inode_t | |
3786 | f->dump_int("last", i->first); | |
3787 | i->second.dump(f); | |
3788 | } | |
3789 | f->close_section(); // old_inode | |
3790 | } | |
3791 | f->close_section(); // old_inodes | |
3792 | ||
3793 | f->open_object_section("dirfragtree"); | |
3794 | dirfragtree.dump(f); | |
3795 | f->close_section(); // dirfragtree | |
3796 | } | |
3797 | ||
3798 | ||
3799 | void InodeStore::generate_test_instances(list<InodeStore*> &ls) | |
3800 | { | |
3801 | InodeStore *populated = new InodeStore; | |
3802 | populated->inode.ino = 0xdeadbeef; | |
3803 | populated->symlink = "rhubarb"; | |
3804 | ls.push_back(populated); | |
3805 | } | |
3806 | ||
/**
 * Asynchronously cross-check this inode's in-memory state against what is
 * stored in RADOS (used by forward scrub).  Results accumulate in
 * *results; fin (if non-null) is completed when validation finishes.
 *
 * Implemented as a multi-stage MDSContinuation:
 *   START     -> issue backtrace read (and optionally write a scrub tag)
 *   BACKTRACE -> compare on-disk vs in-memory backtrace, check InoTable
 *   INODE     -> for base inodes, compare against a freshly fetched shadow
 *   DIRFRAGS  -> verify dirfrag fragstat/rstat sums match the inode
 */
void CInode::validate_disk_state(CInode::validated_data *results,
                                 MDSInternalContext *fin)
{
  class ValidationContinuation : public MDSContinuation {
  public:
    MDSInternalContext *fin;          // completed (with rval) in _done()
    CInode *in;                       // inode under validation
    CInode::validated_data *results;  // where findings are recorded
    bufferlist bl;                    // raw backtrace xattr read from disk
    CInode *shadow_in;                // shadow copy fetched for base inodes

    // continuation stage ids
    enum {
      START = 0,
      BACKTRACE,
      INODE,
      DIRFRAGS
    };

    ValidationContinuation(CInode *i,
                           CInode::validated_data *data_r,
                           MDSInternalContext *fin_) :
                             MDSContinuation(i->mdcache->mds->server),
                             fin(fin_),
                             in(i),
                             results(data_r),
                             shadow_in(NULL) {
      set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start));
      set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace));
      set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk));
      set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags));
    }

    ~ValidationContinuation() override {
      // release the shadow inode (allocated in validate_directory_data)
      // and keep the cache's shadow-inode accounting balanced
      if (shadow_in) {
	delete shadow_in;
	in->mdcache->num_shadow_inodes--;
      }
    }

    /**
     * Fetch backtrace and set tag if tag is non-empty
     */
    void fetch_backtrace_and_tag(CInode *in, std::string tag,
                                 Context *fin, int *bt_r, bufferlist *bt)
    {
      const int64_t pool = in->get_backtrace_pool();
      object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");

      // read the "parent" xattr (the backtrace); bt_r receives its retval
      ObjectOperation fetch;
      fetch.getxattr("parent", bt, bt_r);
      in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
				       NULL, 0, fin);
      if (!tag.empty()) {
	// fire-and-forget write of the scrub tag alongside the read
	ObjectOperation scrub_tag;
	bufferlist tag_bl;
	::encode(tag, tag_bl);
	scrub_tag.setxattr("scrub_tag", tag_bl);
	SnapContext snapc;
	in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc,
					   ceph::real_clock::now(),
					   0, NULL);
      }
    }

    // Stage START: kick off the backtrace read.  Returns true when the
    // continuation is already finished (symlinks have nothing to check).
    bool _start(int rval) {
      if (in->is_dirty()) {
	MDCache *mdcache = in->mdcache;   // for dout
	inode_t& inode = in->inode;       // for dout
	dout(20) << "validating a dirty CInode; results will be inconclusive"
		 << dendl;
      }
      if (in->is_symlink()) {
	// there's nothing to do for symlinks!
	return true;
      }

      C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
					    in->mdcache->mds->finisher);

      // Whether we have a tag to apply depends on ScrubHeader (if one is
      // present)
      if (in->scrub_infop) {
	// I'm a non-orphan, so look up my ScrubHeader via my linkage
	const std::string &tag = in->scrub_infop->header->get_tag();
	// Rather than using the usual CInode::fetch_backtrace,
	// use a special variant that optionally writes a tag in the same
	// operation.
	fetch_backtrace_and_tag(in, tag, conf,
				&results->backtrace.ondisk_read_retval, &bl);
      } else {
	// When we're invoked outside of ScrubStack we might be called
	// on an orphaned inode like /
	fetch_backtrace_and_tag(in, {}, conf,
				&results->backtrace.ondisk_read_retval, &bl);
      }
      return false;
    }

    // Stage BACKTRACE: decode and compare the on-disk backtrace, optionally
    // repair it, and verify the InoTable does not consider this ino free.
    bool _backtrace(int rval) {
      // set up basic result reporting and make sure we got the data
      results->performed_validation = true; // at least, some of it!
      results->backtrace.checked = true;

      const int64_t pool = in->get_backtrace_pool();
      inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
      in->build_backtrace(pool, memory_backtrace);
      bool equivalent, divergent;
      int memory_newer;

      MDCache *mdcache = in->mdcache;  // For the benefit of dout
      const inode_t& inode = in->inode;  // For the benefit of dout

      // Ignore rval because it's the result of a FAILOK operation
      // from fetch_backtrace_and_tag: the real result is in
      // backtrace.ondisk_read_retval
      dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl;
      if (results->backtrace.ondisk_read_retval != 0) {
	results->backtrace.error_str << "failed to read off disk; see retval";
	goto next;
      }

      // extract the backtrace, and compare it to a newly-constructed one
      try {
	bufferlist::iterator p = bl.begin();
	::decode(results->backtrace.ondisk_value, p);
	dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl;
      } catch (buffer::error&) {
	if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
	  // Cases where something has clearly gone wrong with the overall
	  // fetch op, though we didn't get a nonzero rc from the getxattr
	  // operation. e.g. object missing.
	  results->backtrace.ondisk_read_retval = rval;
	}
	results->backtrace.error_str << "failed to decode on-disk backtrace ("
				     << bl.length() << " bytes)!";
	goto next;
      }

      memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value,
					      &equivalent, &divergent);

      if (divergent || memory_newer < 0) {
	// we're divergent, or on-disk version is newer
	results->backtrace.error_str << "On-disk backtrace is divergent or newer";
      } else {
	results->backtrace.passed = true;
      }
next:

      // NOTE(review): scrub_infop may be NULL here (the orphan path in
      // _start() explicitly allows it); this dereference looks unguarded
      // on a failed backtrace — confirm callers always have scrub_infop
      // when repair is reachable.
      if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) {
	std::string path;
	in->make_path_string(path);
	in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
				       << "(" << path << "), rewriting it";
	in->_mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
			       false);
	// Flag that we repaired this BT so that it won't go into damagetable
	results->backtrace.repaired = true;

	// Flag that we did some repair work so that our repair operation
	// can be flushed at end of scrub
	in->scrub_infop->header->set_repaired();
      }

      // If the inode's number was free in the InoTable, fix that
      // (#15619)
      {
	InoTable *inotable = mdcache->mds->inotable;

	dout(10) << "scrub: inotable ino = " << inode.ino << dendl;
	dout(10) << "scrub: inotable free says "
		 << inotable->is_marked_free(inode.ino) << dendl;

	if (inotable->is_marked_free(inode.ino)) {
	  LogChannelRef clog = in->mdcache->mds->clog;
	  clog->error() << "scrub: inode wrongly marked free: 0x" << std::hex
	     << inode.ino;

	  if (in->scrub_infop->header->get_repair()) {
	    bool repaired = inotable->repair(inode.ino);
	    if (repaired) {
	      clog->error() << "inode table repaired for inode: 0x" << std::hex
		 << inode.ino;

	      inotable->save();
	    } else {
	      clog->error() << "Cannot repair inotable while other operations"
		" are in progress";
	    }
	  }
	}
      }

      // quit if we're a file, or kick off directory checks otherwise
      // TODO: validate on-disk inode for non-base directories
      if (!in->is_dir()) {
	return true;
      }

      return validate_directory_data();
    }

    // For directories: base inodes get a shadow fetch of the on-disk
    // inode (stage INODE); others go straight to the dirfrag checks.
    bool validate_directory_data() {
      assert(in->is_dir());

      if (in->is_base()) {
	if (!shadow_in) {
	  shadow_in = new CInode(in->mdcache);
	  in->mdcache->create_unlinked_system_inode(shadow_in, in->inode.ino, in->inode.mode);
	  in->mdcache->num_shadow_inodes++;
	}
	shadow_in->fetch(get_internal_callback(INODE));
	return false;
      } else {
	// TODO: validate on-disk inode for non-base directories
	results->inode.passed = true;
	return check_dirfrag_rstats();
      }
    }

    // Stage INODE: compare the freshly fetched shadow inode against the
    // in-memory one; on-disk must not be newer or divergent.
    bool _inode_disk(int rval) {
      results->inode.checked = true;
      results->inode.ondisk_read_retval = rval;
      results->inode.ondisk_value = shadow_in->inode;
      results->inode.memory_value = in->inode;

      inode_t& si = shadow_in->inode;
      inode_t& i = in->inode;
      if (si.version > i.version) {
	// uh, what?
	results->inode.error_str << "On-disk inode is newer than in-memory one!";
	goto next;
      } else {
	bool divergent = false;
	int r = i.compare(si, &divergent);
	results->inode.passed = !divergent && r >= 0;
	if (!results->inode.passed) {
	  results->inode.error_str <<
	      "On-disk inode is divergent or newer than in-memory one!";
	  goto next;
	}
      }
next:
      return check_dirfrag_rstats();
    }

    // Scrub every leaf dirfrag locally; fetch any incomplete ones first
    // and rendezvous on stage DIRFRAGS via a gather.
    bool check_dirfrag_rstats() {
      MDSGatherBuilder gather(g_ceph_context);
      std::list<frag_t> frags;
      in->dirfragtree.get_leaves(frags);
      for (list<frag_t>::iterator p = frags.begin();
          p != frags.end();
          ++p) {
	CDir *dir = in->get_or_open_dirfrag(in->mdcache, *p);
	dir->scrub_info();
	if (!dir->scrub_infop->header)
	  dir->scrub_infop->header = in->scrub_infop->header;
	if (dir->is_complete()) {
	  dir->scrub_local();
	} else {
	  dir->scrub_infop->need_scrub_local = true;
	  dir->fetch(gather.new_sub(), false);
	}
      }
      if (gather.has_subs()) {
	gather.set_finisher(get_internal_callback(DIRFRAGS));
	gather.activate();
	return false;
      } else {
	// all dirfrags already complete: run the check synchronously
	return immediate(DIRFRAGS, 0);
      }
    }

    // Stage DIRFRAGS: sum accounted fragstat/rstat over all dirfrags and
    // compare against the inode's dirstat/rstat, repairing if requested.
    bool _dirfrags(int rval) {
      int frags_errors = 0;
      // basic reporting setup
      results->raw_stats.checked = true;
      results->raw_stats.ondisk_read_retval = rval;

      results->raw_stats.memory_value.dirstat = in->inode.dirstat;
      results->raw_stats.memory_value.rstat = in->inode.rstat;
      frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
      nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;

      if (rval != 0) {
	results->raw_stats.error_str << "Failed to read dirfrags off disk";
	goto next;
      }

      // check each dirfrag...
      for (compact_map<frag_t,CDir*>::iterator p = in->dirfrags.begin();
	   p != in->dirfrags.end();
	   ++p) {
	CDir *dir = p->second;
	assert(dir->get_version() > 0);
	nest_info.add(dir->fnode.accounted_rstat);
	dir_info.add(dir->fnode.accounted_fragstat);
	if (dir->scrub_infop &&
	    dir->scrub_infop->pending_scrub_error) {
	  dir->scrub_infop->pending_scrub_error = false;
	  if (dir->scrub_infop->header->get_repair()) {
	    results->raw_stats.repaired = true;
	    results->raw_stats.error_str
	      << "dirfrag(" << p->first << ") has bad stats (will be fixed); ";
	  } else {
	    results->raw_stats.error_str
	      << "dirfrag(" << p->first << ") has bad stats; ";
	  }
	  frags_errors++;
	}
      }
      nest_info.rsubdirs++; // it gets one to account for self
      // ...and that their sum matches our inode settings
      if (!dir_info.same_sums(in->inode.dirstat) ||
	  !nest_info.same_sums(in->inode.rstat)) {
	if (in->scrub_infop &&
	    in->scrub_infop->header->get_repair()) {
	  results->raw_stats.error_str
	    << "freshly-calculated rstats don't match existing ones (will be fixed)";
	  in->mdcache->repair_inode_stats(in);
	  results->raw_stats.repaired = true;
	} else {
	  results->raw_stats.error_str
	    << "freshly-calculated rstats don't match existing ones";
	}
	goto next;
      }
      if (frags_errors > 0)
	goto next;

      results->raw_stats.passed = true;
next:
      return true;
    }

    // Final stage: overall pass = every checked section passed; then
    // complete the caller's context with the continuation's rval.
    void _done() override {
      if ((!results->raw_stats.checked || results->raw_stats.passed) &&
	  (!results->backtrace.checked || results->backtrace.passed) &&
	  (!results->inode.checked || results->inode.passed))
	results->passed_validation = true;
      if (fin) {
	fin->complete(get_rval());
      }
    }
  };


  dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
  ValidationContinuation *vc = new ValidationContinuation(this,
                                                          results,
                                                          fin);
  vc->begin();
}
4159 | ||
/**
 * Dump scrub validation results: per-section (backtrace, raw_stats)
 * checked/passed flags, values, and error text, plus an overall
 * return_code taken from the last section that recorded a read failure.
 */
void CInode::validated_data::dump(Formatter *f) const
{
  f->open_object_section("results");
  {
    f->dump_bool("performed_validation", performed_validation);
    f->dump_bool("passed_validation", passed_validation);
    f->open_object_section("backtrace");
    {
      f->dump_bool("checked", backtrace.checked);
      f->dump_bool("passed", backtrace.passed);
      f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
      f->dump_stream("ondisk_value") << backtrace.ondisk_value;
      // NOTE(review): key is "memoryvalue" (no underscore), inconsistent
      // with the other keys; external consumers may depend on it, so it
      // is left as-is.
      f->dump_stream("memoryvalue") << backtrace.memory_value;
      f->dump_string("error_str", backtrace.error_str.str());
    }
    f->close_section(); // backtrace
    f->open_object_section("raw_stats");
    {
      f->dump_bool("checked", raw_stats.checked);
      f->dump_bool("passed", raw_stats.passed);
      f->dump_int("read_ret_val", raw_stats.ondisk_read_retval);
      f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat;
      f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat;
      // NOTE(review): "dirrstat" looks like a typo for "dirstat", but the
      // key is emitted output — changing it could break consumers.
      f->dump_stream("memory_value.dirrstat") << raw_stats.memory_value.dirstat;
      f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat;
      f->dump_string("error_str", raw_stats.error_str.str());
    }
    f->close_section(); // raw_stats
    // dump failure return code
    int rc = 0;
    if (backtrace.checked && backtrace.ondisk_read_retval)
      rc = backtrace.ondisk_read_retval;
    if (inode.checked && inode.ondisk_read_retval)
      rc = inode.ondisk_read_retval;
    if (raw_stats.checked && raw_stats.ondisk_read_retval)
      rc = raw_stats.ondisk_read_retval;
    f->dump_int("return_code", rc);
  }
  f->close_section(); // results
}
4200 | ||
b32b8144 FG |
4201 | bool CInode::validated_data::all_damage_repaired() const |
4202 | { | |
4203 | bool unrepaired = | |
4204 | (raw_stats.checked && !raw_stats.passed && !raw_stats.repaired) | |
4205 | || | |
4206 | (backtrace.checked && !backtrace.passed && !backtrace.repaired) | |
4207 | || | |
4208 | (inode.checked && !inode.passed && !inode.repaired); | |
4209 | ||
4210 | return !unrepaired; | |
4211 | } | |
4212 | ||
7c673cae FG |
/**
 * Dump the full debug state of this CInode to a Formatter: stored inode
 * fields, cache-object state, every lock, state flags, per-client caps,
 * loner tracking, and caps wanted by other MDS ranks.
 */
void CInode::dump(Formatter *f) const
{
  InodeStoreBase::dump(f);

  MDSCacheObject::dump(f);

  // one section per lock
  f->open_object_section("versionlock");
  versionlock.dump(f);
  f->close_section();

  f->open_object_section("authlock");
  authlock.dump(f);
  f->close_section();

  f->open_object_section("linklock");
  linklock.dump(f);
  f->close_section();

  f->open_object_section("dirfragtreelock");
  dirfragtreelock.dump(f);
  f->close_section();

  f->open_object_section("filelock");
  filelock.dump(f);
  f->close_section();

  f->open_object_section("xattrlock");
  xattrlock.dump(f);
  f->close_section();

  f->open_object_section("snaplock");
  snaplock.dump(f);
  f->close_section();

  f->open_object_section("nestlock");
  nestlock.dump(f);
  f->close_section();

  f->open_object_section("flocklock");
  flocklock.dump(f);
  f->close_section();

  f->open_object_section("policylock");
  policylock.dump(f);
  f->close_section();

  // CInode-specific state bits (generic ones come from dump_states())
  f->open_array_section("states");
  MDSCacheObject::dump_states(f);
  if (state_test(STATE_EXPORTING))
    f->dump_string("state", "exporting");
  if (state_test(STATE_OPENINGDIR))
    f->dump_string("state", "openingdir");
  if (state_test(STATE_FREEZING))
    f->dump_string("state", "freezing");
  if (state_test(STATE_FROZEN))
    f->dump_string("state", "frozen");
  if (state_test(STATE_AMBIGUOUSAUTH))
    f->dump_string("state", "ambiguousauth");
  if (state_test(STATE_EXPORTINGCAPS))
    f->dump_string("state", "exportingcaps");
  if (state_test(STATE_NEEDSRECOVER))
    f->dump_string("state", "needsrecover");
  if (state_test(STATE_PURGING))
    f->dump_string("state", "purging");
  if (state_test(STATE_DIRTYPARENT))
    f->dump_string("state", "dirtyparent");
  if (state_test(STATE_DIRTYRSTAT))
    f->dump_string("state", "dirtyrstat");
  if (state_test(STATE_STRAYPINNED))
    f->dump_string("state", "straypinned");
  if (state_test(STATE_FROZENAUTHPIN))
    f->dump_string("state", "frozenauthpin");
  if (state_test(STATE_DIRTYPOOL))
    f->dump_string("state", "dirtypool");
  if (state_test(STATE_ORPHAN))
    f->dump_string("state", "orphan");
  if (state_test(STATE_MISSINGOBJS))
    f->dump_string("state", "missingobjs");
  f->close_section();

  // per-client capability state
  f->open_array_section("client_caps");
  for (map<client_t,Capability*>::const_iterator it = client_caps.begin();
       it != client_caps.end(); ++it) {
    f->open_object_section("client_cap");
    f->dump_int("client_id", it->first.v);
    f->dump_string("pending", ccap_string(it->second->pending()));
    f->dump_string("issued", ccap_string(it->second->issued()));
    f->dump_string("wanted", ccap_string(it->second->wanted()));
    f->dump_int("last_sent", it->second->get_last_sent());
    f->close_section();
  }
  f->close_section();

  f->dump_int("loner", loner_cap.v);
  f->dump_int("want_loner", want_loner_cap.v);

  // caps that other MDS ranks want on this inode
  f->open_array_section("mds_caps_wanted");
  for (compact_map<int,int>::const_iterator p = mds_caps_wanted.begin();
       p != mds_caps_wanted.end(); ++p) {
    f->open_object_section("mds_cap_wanted");
    f->dump_int("rank", p->first);
    f->dump_string("cap", ccap_string(p->second));
    f->close_section();
  }
  f->close_section();
}
4319 | ||
4320 | /****** Scrub Stuff *****/ | |
4321 | void CInode::scrub_info_create() const | |
4322 | { | |
4323 | dout(25) << __func__ << dendl; | |
4324 | assert(!scrub_infop); | |
4325 | ||
4326 | // break out of const-land to set up implicit initial state | |
4327 | CInode *me = const_cast<CInode*>(this); | |
4328 | inode_t *in = me->get_projected_inode(); | |
4329 | ||
4330 | scrub_info_t *si = new scrub_info_t(); | |
4331 | si->scrub_start_stamp = si->last_scrub_stamp = in->last_scrub_stamp; | |
4332 | si->scrub_start_version = si->last_scrub_version = in->last_scrub_version; | |
4333 | ||
4334 | me->scrub_infop = si; | |
4335 | } | |
4336 | ||
4337 | void CInode::scrub_maybe_delete_info() | |
4338 | { | |
4339 | if (scrub_infop && | |
4340 | !scrub_infop->scrub_in_progress && | |
4341 | !scrub_infop->last_scrub_dirty) { | |
4342 | delete scrub_infop; | |
4343 | scrub_infop = NULL; | |
4344 | } | |
4345 | } | |
4346 | ||
/**
 * Begin a scrub of this inode.
 *
 * @param scrub_parent dentry we were reached through; pinned for the
 *                     duration of the scrub (may be NULL for orphans)
 * @param header       shared scrub parameters (tag, force, repair, ...)
 * @param f            context to complete when this inode's scrub finishes
 */
void CInode::scrub_initialize(CDentry *scrub_parent,
			      ScrubHeaderRef& header,
			      MDSInternalContextBase *f)
{
  dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
  assert(!scrub_is_in_progress());
  scrub_info();
  if (!scrub_infop)
    scrub_infop = new scrub_info_t();

  if (get_projected_inode()->is_dir()) {
    // fill in dirfrag_stamps with initial state
    std::list<frag_t> frags;
    dirfragtree.get_leaves(frags);
    for (std::list<frag_t>::iterator i = frags.begin();
        i != frags.end();
        ++i) {
      if (header->get_force())
	scrub_infop->dirfrag_stamps[*i].reset();
      else
	// default-construct the entry so scrub_dirfrag_next() sees it
	scrub_infop->dirfrag_stamps[*i];
    }
  }

  if (scrub_parent)
    scrub_parent->get(CDentry::PIN_SCRUBPARENT);
  scrub_infop->scrub_parent = scrub_parent;
  scrub_infop->on_finish = f;
  scrub_infop->scrub_in_progress = true;
  scrub_infop->children_scrubbed = false;
  scrub_infop->header = header;

  scrub_infop->scrub_start_version = get_version();
  scrub_infop->scrub_start_stamp = ceph_clock_now();
  // right now we don't handle remote inodes
}
4383 | ||
4384 | int CInode::scrub_dirfrag_next(frag_t* out_dirfrag) | |
4385 | { | |
4386 | dout(20) << __func__ << dendl; | |
4387 | assert(scrub_is_in_progress()); | |
4388 | ||
4389 | if (!is_dir()) { | |
4390 | return -ENOTDIR; | |
4391 | } | |
4392 | ||
4393 | std::map<frag_t, scrub_stamp_info_t>::iterator i = | |
4394 | scrub_infop->dirfrag_stamps.begin(); | |
4395 | ||
4396 | while (i != scrub_infop->dirfrag_stamps.end()) { | |
4397 | if (i->second.scrub_start_version < scrub_infop->scrub_start_version) { | |
4398 | i->second.scrub_start_version = get_projected_version(); | |
4399 | i->second.scrub_start_stamp = ceph_clock_now(); | |
4400 | *out_dirfrag = i->first; | |
4401 | dout(20) << " return frag " << *out_dirfrag << dendl; | |
4402 | return 0; | |
4403 | } | |
4404 | ++i; | |
4405 | } | |
4406 | ||
4407 | dout(20) << " no frags left, ENOENT " << dendl; | |
4408 | return ENOENT; | |
4409 | } | |
4410 | ||
4411 | void CInode::scrub_dirfrags_scrubbing(list<frag_t>* out_dirfrags) | |
4412 | { | |
4413 | assert(out_dirfrags != NULL); | |
4414 | assert(scrub_infop != NULL); | |
4415 | ||
4416 | out_dirfrags->clear(); | |
4417 | std::map<frag_t, scrub_stamp_info_t>::iterator i = | |
4418 | scrub_infop->dirfrag_stamps.begin(); | |
4419 | ||
4420 | while (i != scrub_infop->dirfrag_stamps.end()) { | |
4421 | if (i->second.scrub_start_version >= scrub_infop->scrub_start_version) { | |
4422 | if (i->second.last_scrub_version < scrub_infop->scrub_start_version) | |
4423 | out_dirfrags->push_back(i->first); | |
4424 | } else { | |
4425 | return; | |
4426 | } | |
4427 | ||
4428 | ++i; | |
4429 | } | |
4430 | } | |
4431 | ||
4432 | void CInode::scrub_dirfrag_finished(frag_t dirfrag) | |
4433 | { | |
4434 | dout(20) << __func__ << " on frag " << dirfrag << dendl; | |
4435 | assert(scrub_is_in_progress()); | |
4436 | ||
4437 | std::map<frag_t, scrub_stamp_info_t>::iterator i = | |
4438 | scrub_infop->dirfrag_stamps.find(dirfrag); | |
4439 | assert(i != scrub_infop->dirfrag_stamps.end()); | |
4440 | ||
4441 | scrub_stamp_info_t &si = i->second; | |
4442 | si.last_scrub_stamp = si.scrub_start_stamp; | |
4443 | si.last_scrub_version = si.scrub_start_version; | |
4444 | } | |
4445 | ||
4446 | void CInode::scrub_finished(MDSInternalContextBase **c) { | |
4447 | dout(20) << __func__ << dendl; | |
4448 | assert(scrub_is_in_progress()); | |
4449 | for (std::map<frag_t, scrub_stamp_info_t>::iterator i = | |
4450 | scrub_infop->dirfrag_stamps.begin(); | |
4451 | i != scrub_infop->dirfrag_stamps.end(); | |
4452 | ++i) { | |
4453 | if(i->second.last_scrub_version != i->second.scrub_start_version) { | |
4454 | derr << i->second.last_scrub_version << " != " | |
4455 | << i->second.scrub_start_version << dendl; | |
4456 | } | |
4457 | assert(i->second.last_scrub_version == i->second.scrub_start_version); | |
4458 | } | |
4459 | ||
4460 | scrub_infop->last_scrub_version = scrub_infop->scrub_start_version; | |
4461 | scrub_infop->last_scrub_stamp = scrub_infop->scrub_start_stamp; | |
4462 | scrub_infop->last_scrub_dirty = true; | |
4463 | scrub_infop->scrub_in_progress = false; | |
4464 | ||
4465 | if (scrub_infop->scrub_parent) { | |
4466 | CDentry *dn = scrub_infop->scrub_parent; | |
4467 | scrub_infop->scrub_parent = NULL; | |
4468 | dn->dir->scrub_dentry_finished(dn); | |
4469 | dn->put(CDentry::PIN_SCRUBPARENT); | |
4470 | } | |
4471 | ||
4472 | *c = scrub_infop->on_finish; | |
4473 | scrub_infop->on_finish = NULL; | |
4474 | ||
4475 | if (scrub_infop->header->get_origin() == this) { | |
4476 | // We are at the point that a tagging scrub was initiated | |
4477 | LogChannelRef clog = mdcache->mds->clog; | |
b32b8144 FG |
4478 | if (scrub_infop->header->get_tag().empty()) { |
4479 | clog->info() << "scrub complete"; | |
4480 | } else { | |
4481 | clog->info() << "scrub complete with tag '" | |
4482 | << scrub_infop->header->get_tag() << "'"; | |
4483 | } | |
7c673cae FG |
4484 | } |
4485 | } | |
4486 | ||
4487 | int64_t CInode::get_backtrace_pool() const | |
4488 | { | |
4489 | if (is_dir()) { | |
4490 | return mdcache->mds->mdsmap->get_metadata_pool(); | |
4491 | } else { | |
4492 | // Files are required to have an explicit layout that specifies | |
4493 | // a pool | |
4494 | assert(inode.layout.pool_id != -1); | |
4495 | return inode.layout.pool_id; | |
4496 | } | |
4497 | } | |
4498 | ||
31f18b77 FG |
4499 | void CInode::maybe_export_pin(bool update) |
4500 | { | |
4501 | if (!g_conf->mds_bal_export_pin) | |
4502 | return; | |
4503 | if (!is_dir() || !is_normal()) | |
4504 | return; | |
7c673cae | 4505 | |
31f18b77 FG |
4506 | mds_rank_t export_pin = get_export_pin(false); |
4507 | if (export_pin == MDS_RANK_NONE && !update) | |
4508 | return; | |
7c673cae | 4509 | |
31f18b77 FG |
4510 | if (state_test(CInode::STATE_QUEUEDEXPORTPIN)) |
4511 | return; | |
4512 | ||
4513 | bool queue = false; | |
4514 | for (auto p = dirfrags.begin(); p != dirfrags.end(); p++) { | |
4515 | CDir *dir = p->second; | |
4516 | if (!dir->is_auth()) | |
4517 | continue; | |
4518 | if (export_pin != MDS_RANK_NONE) { | |
4519 | if (dir->is_subtree_root()) { | |
4520 | // set auxsubtree bit or export it | |
4521 | if (!dir->state_test(CDir::STATE_AUXSUBTREE) || | |
4522 | export_pin != dir->get_dir_auth().first) | |
4523 | queue = true; | |
4524 | } else { | |
4525 | // create aux subtree or export it | |
4526 | queue = true; | |
7c673cae | 4527 | } |
31f18b77 FG |
4528 | } else { |
4529 | // clear aux subtrees ? | |
4530 | queue = dir->state_test(CDir::STATE_AUXSUBTREE); | |
4531 | } | |
4532 | if (queue) { | |
4533 | state_set(CInode::STATE_QUEUEDEXPORTPIN); | |
7c673cae | 4534 | mdcache->export_pin_queue.insert(this); |
31f18b77 | 4535 | break; |
7c673cae FG |
4536 | } |
4537 | } | |
4538 | } | |
4539 | ||
// Record the export pin in the projected inode and immediately
// re-evaluate whether this inode must be queued for export work.
void CInode::set_export_pin(mds_rank_t rank)
{
  assert(is_dir());
  assert(is_projected());  // caller must have projected the inode already
  get_projected_inode()->export_pin = rank;
  maybe_export_pin(true);  // update=true: also handles clearing a pin
}
4547 | ||
4548 | mds_rank_t CInode::get_export_pin(bool inherit) const | |
4549 | { | |
4550 | /* An inode that is export pinned may not necessarily be a subtree root, we | |
4551 | * need to traverse the parents. A base or system inode cannot be pinned. | |
4552 | * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not | |
4553 | * have a parent yet. | |
4554 | */ | |
b32b8144 FG |
4555 | const CInode *in = this; |
4556 | while (true) { | |
4557 | if (in->is_system()) | |
4558 | break; | |
4559 | const CDentry *pdn = in->get_projected_parent_dn(); | |
4560 | if (!pdn) | |
4561 | break; | |
4562 | const inode_t *pi = in->get_projected_inode(); | |
4563 | // ignore export pin for unlinked directory | |
4564 | if (pi->nlink == 0) | |
4565 | break; | |
4566 | if (pi->export_pin >= 0) | |
4567 | return pi->export_pin; | |
4568 | ||
4569 | if (!inherit) | |
4570 | break; | |
4571 | in = pdn->get_dir()->inode; | |
7c673cae FG |
4572 | } |
4573 | return MDS_RANK_NONE; | |
4574 | } | |
4575 | ||
4576 | bool CInode::is_exportable(mds_rank_t dest) const | |
4577 | { | |
4578 | mds_rank_t pin = get_export_pin(); | |
4579 | if (pin == dest) { | |
4580 | return true; | |
4581 | } else if (pin >= 0) { | |
4582 | return false; | |
4583 | } else { | |
4584 | return true; | |
4585 | } | |
4586 | } | |
181888fb FG |
4587 | |
// Register the mempool object factory for CInode (as "co_inode") in the
// mds_co pool so its allocations are accounted there.
MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);