1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include "SessionMap.h"
19 #include "osdc/Filer.h"
20 #include "common/Finisher.h"
22 #include "common/config.h"
23 #include "common/errno.h"
24 #include "common/DecayCounter.h"
25 #include "include/ceph_assert.h"
26 #include "include/stringify.h"
28 #define dout_context g_ceph_context
29 #define dout_subsys ceph_subsys_mds
31 #define dout_prefix *_dout << "mds." << rank << ".sessionmap "
34 class SessionMapIOContext
: public MDSIOContextBase
37 SessionMap
*sessionmap
;
38 MDSRank
*get_mds() override
{return sessionmap
->mds
;}
40 explicit SessionMapIOContext(SessionMap
*sessionmap_
) : sessionmap(sessionmap_
) {
41 ceph_assert(sessionmap
!= NULL
);
46 void SessionMap::register_perfcounters()
48 PerfCountersBuilder
plb(g_ceph_context
, "mds_sessions",
49 l_mdssm_first
, l_mdssm_last
);
51 plb
.add_u64(l_mdssm_session_count
, "session_count",
52 "Session count", "sess", PerfCountersBuilder::PRIO_INTERESTING
);
54 plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
55 plb
.add_u64_counter(l_mdssm_session_add
, "session_add",
57 plb
.add_u64_counter(l_mdssm_session_remove
, "session_remove",
59 plb
.add_u64(l_mdssm_session_open
, "sessions_open",
60 "Sessions currently open");
61 plb
.add_u64(l_mdssm_session_stale
, "sessions_stale",
62 "Sessions currently stale");
63 plb
.add_u64(l_mdssm_total_load
, "total_load", "Total Load");
64 plb
.add_u64(l_mdssm_avg_load
, "average_load", "Average Load");
65 plb
.add_u64(l_mdssm_avg_session_uptime
, "avg_session_uptime",
66 "Average session uptime");
68 logger
= plb
.create_perf_counters();
69 g_ceph_context
->get_perfcounters_collection()->add(logger
);
72 void SessionMap::dump()
74 dout(10) << "dump" << dendl
;
75 for (ceph::unordered_map
<entity_name_t
,Session
*>::iterator p
= session_map
.begin();
76 p
!= session_map
.end();
78 dout(10) << p
->first
<< " " << p
->second
79 << " state " << p
->second
->get_state_name()
80 << " completed " << p
->second
->info
.completed_requests
81 << " free_prealloc_inos " << p
->second
->free_prealloc_inos
82 << " delegated_inos " << p
->second
->delegated_inos
91 object_t
SessionMap::get_object_name() const
94 snprintf(s
, sizeof(s
), "mds%d_sessionmap", int(mds
->get_nodeid()));
99 class C_IO_SM_Load
: public SessionMapIOContext
{
101 const bool first
; //< Am I the initial (header) load?
102 int header_r
; //< Return value from OMAP header read
103 int values_r
; //< Return value from OMAP value read
104 bufferlist header_bl
;
105 std::map
<std::string
, bufferlist
> session_vals
;
106 bool more_session_vals
= false;
108 C_IO_SM_Load(SessionMap
*cm
, const bool f
)
109 : SessionMapIOContext(cm
), first(f
), header_r(0), values_r(0) {}
111 void finish(int r
) override
{
112 sessionmap
->_load_finish(r
, header_r
, values_r
, first
, header_bl
, session_vals
,
115 void print(ostream
& out
) const override
{
116 out
<< "session_load";
123 * Decode OMAP header. Call this once when loading.
125 void SessionMapStore::decode_header(
126 bufferlist
&header_bl
)
128 auto q
= header_bl
.cbegin();
/**
 * Encode the OMAP header into *header_bl.
 *
 * The only field written is the store's current `version`, wrapped in an
 * ENCODE_START(1, 1, ...) / ENCODE_FINISH envelope.
 *
 * @param header_bl  buffer the encoded header is written into; it is
 *                   dereferenced unconditionally.
 */
void SessionMapStore::encode_header(
    bufferlist *header_bl)
{
  ENCODE_START(1, 1, *header_bl);
  encode(version, *header_bl);
  ENCODE_FINISH(*header_bl);
}
143 * Decode and insert some serialized OMAP values. Call this
144 * repeatedly to insert batched loads.
146 void SessionMapStore::decode_values(std::map
<std::string
, bufferlist
> &session_vals
)
148 for (std::map
<std::string
, bufferlist
>::iterator i
= session_vals
.begin();
149 i
!= session_vals
.end(); ++i
) {
153 bool parsed
= inst
.name
.parse(i
->first
);
155 derr
<< "Corrupt entity name '" << i
->first
<< "' in sessionmap" << dendl
;
156 throw buffer::malformed_input("Corrupt entity name in sessionmap");
159 Session
*s
= get_or_add_session(inst
);
160 if (s
->is_closed()) {
161 s
->set_state(Session::STATE_OPEN
);
162 s
->set_load_avg_decay_rate(decay_rate
);
164 auto q
= i
->second
.cbegin();
170 * An OMAP read finished.
172 void SessionMap::_load_finish(
177 bufferlist
&header_bl
,
178 std::map
<std::string
, bufferlist
> &session_vals
,
179 bool more_session_vals
)
181 if (operation_r
< 0) {
182 derr
<< "_load_finish got " << cpp_strerror(operation_r
) << dendl
;
183 mds
->clog
->error() << "error reading sessionmap '" << get_object_name()
184 << "' " << operation_r
<< " ("
185 << cpp_strerror(operation_r
) << ")";
187 ceph_abort(); // Should be unreachable because damaged() calls respawn()
193 derr
<< __func__
<< ": header error: " << cpp_strerror(header_r
) << dendl
;
194 mds
->clog
->error() << "error reading sessionmap header "
195 << header_r
<< " (" << cpp_strerror(header_r
) << ")";
197 ceph_abort(); // Should be unreachable because damaged() calls respawn()
200 if(header_bl
.length() == 0) {
201 dout(4) << __func__
<< ": header missing, loading legacy..." << dendl
;
207 decode_header(header_bl
);
208 } catch (buffer::error
&e
) {
209 mds
->clog
->error() << "corrupt sessionmap header: " << e
.what();
211 ceph_abort(); // Should be unreachable because damaged() calls respawn()
213 dout(10) << __func__
<< " loaded version " << version
<< dendl
;
217 derr
<< __func__
<< ": error reading values: "
218 << cpp_strerror(values_r
) << dendl
;
219 mds
->clog
->error() << "error reading sessionmap values: "
220 << values_r
<< " (" << cpp_strerror(values_r
) << ")";
222 ceph_abort(); // Should be unreachable because damaged() calls respawn()
225 // Decode session_vals
227 decode_values(session_vals
);
228 } catch (buffer::error
&e
) {
229 mds
->clog
->error() << "corrupt sessionmap values: " << e
.what();
231 ceph_abort(); // Should be unreachable because damaged() calls respawn()
234 if (more_session_vals
) {
235 // Issue another read if we're not at the end of the omap
236 const std::string last_key
= session_vals
.rbegin()->first
;
237 dout(10) << __func__
<< ": continue omap load from '"
238 << last_key
<< "'" << dendl
;
239 object_t oid
= get_object_name();
240 object_locator_t
oloc(mds
->mdsmap
->get_metadata_pool());
241 C_IO_SM_Load
*c
= new C_IO_SM_Load(this, false);
243 op
.omap_get_vals(last_key
, "", g_conf()->mds_sessionmap_keys_per_op
,
244 &c
->session_vals
, &c
->more_session_vals
, &c
->values_r
);
245 mds
->objecter
->read(oid
, oloc
, op
, CEPH_NOSNAP
, NULL
, 0,
246 new C_OnFinisher(c
, mds
->finisher
));
248 // I/O is complete. Update `by_state`
249 dout(10) << __func__
<< ": omap load complete" << dendl
;
250 for (ceph::unordered_map
<entity_name_t
, Session
*>::iterator i
= session_map
.begin();
251 i
!= session_map
.end(); ++i
) {
252 Session
*s
= i
->second
;
253 auto by_state_entry
= by_state
.find(s
->get_state());
254 if (by_state_entry
== by_state
.end())
255 by_state_entry
= by_state
.emplace(s
->get_state(),
256 new xlist
<Session
*>).first
;
257 by_state_entry
->second
->push_back(&s
->item_session_list
);
260 // Population is complete. Trigger load waiters.
261 dout(10) << __func__
<< ": v " << version
262 << ", " << session_map
.size() << " sessions" << dendl
;
263 projected
= committing
= committed
= version
;
265 finish_contexts(g_ceph_context
, waiting_for_load
);
270 * Populate session state from OMAP records in this
271 * rank's sessionmap object.
273 void SessionMap::load(MDSContext
*onload
)
275 dout(10) << "load" << dendl
;
278 waiting_for_load
.push_back(onload
);
280 C_IO_SM_Load
*c
= new C_IO_SM_Load(this, true);
281 object_t oid
= get_object_name();
282 object_locator_t
oloc(mds
->mdsmap
->get_metadata_pool());
285 op
.omap_get_header(&c
->header_bl
, &c
->header_r
);
286 op
.omap_get_vals("", "", g_conf()->mds_sessionmap_keys_per_op
,
287 &c
->session_vals
, &c
->more_session_vals
, &c
->values_r
);
289 mds
->objecter
->read(oid
, oloc
, op
, CEPH_NOSNAP
, NULL
, 0, new C_OnFinisher(c
, mds
->finisher
));
293 class C_IO_SM_LoadLegacy
: public SessionMapIOContext
{
296 explicit C_IO_SM_LoadLegacy(SessionMap
*cm
) : SessionMapIOContext(cm
) {}
297 void finish(int r
) override
{
298 sessionmap
->_load_legacy_finish(r
, bl
);
300 void print(ostream
& out
) const override
{
301 out
<< "session_load_legacy";
/**
 * Load legacy (object data blob) SessionMap format, assuming
 * that waiting_for_load has already been populated with
 * the relevant completion.  This is the fallback if we do not
 * find an OMAP header when attempting to load normally.
 */
void SessionMap::load_legacy()
{
  dout(10) << __func__ << dendl;

  // The completion owns the read buffer (c->bl) and forwards it, together
  // with the result code, to _load_legacy_finish().
  C_IO_SM_LoadLegacy *c = new C_IO_SM_LoadLegacy(this);
  object_t oid = get_object_name();
  object_locator_t oloc(mds->mdsmap->get_metadata_pool());

  // Read the whole sessionmap object into c->bl; the completion is wrapped
  // in a C_OnFinisher built on mds->finisher.
  mds->objecter->read_full(oid, oloc, CEPH_NOSNAP, &c->bl, 0,
                           new C_OnFinisher(c, mds->finisher));
}
325 void SessionMap::_load_legacy_finish(int r
, bufferlist
&bl
)
327 auto blp
= bl
.cbegin();
329 derr
<< "_load_finish got " << cpp_strerror(r
) << dendl
;
330 ceph_abort_msg("failed to load sessionmap");
333 decode_legacy(blp
); // note: this sets last_cap_renew = now()
334 dout(10) << "_load_finish v " << version
335 << ", " << session_map
.size() << " sessions, "
336 << bl
.length() << " bytes"
338 projected
= committing
= committed
= version
;
341 // Mark all sessions dirty, so that on next save() we will write
342 // a complete OMAP version of the data loaded from the legacy format
343 for (ceph::unordered_map
<entity_name_t
, Session
*>::iterator i
= session_map
.begin();
344 i
!= session_map
.end(); ++i
) {
345 // Don't use mark_dirty because on this occasion we want to ignore the
346 // keys_per_op limit and do one big write (upgrade must be atomic)
347 dirty_sessions
.insert(i
->first
);
349 loaded_legacy
= true;
351 finish_contexts(g_ceph_context
, waiting_for_load
);
359 class C_IO_SM_Save
: public SessionMapIOContext
{
362 C_IO_SM_Save(SessionMap
*cm
, version_t v
) : SessionMapIOContext(cm
), version(v
) {}
363 void finish(int r
) override
{
365 get_mds()->handle_write_error(r
);
367 sessionmap
->_save_finish(version
);
370 void print(ostream
& out
) const override
{
371 out
<< "session_save";
376 void SessionMap::save(MDSContext
*onsave
, version_t needv
)
378 dout(10) << __func__
<< ": needv " << needv
<< ", v " << version
<< dendl
;
380 if (needv
&& committing
>= needv
) {
381 ceph_assert(committing
> committed
);
382 commit_waiters
[committing
].push_back(onsave
);
386 commit_waiters
[version
].push_back(onsave
);
388 committing
= version
;
390 object_t oid
= get_object_name();
391 object_locator_t
oloc(mds
->mdsmap
->get_metadata_pool());
395 /* Compose OSD OMAP transaction for full write */
396 bufferlist header_bl
;
397 encode_header(&header_bl
);
398 op
.omap_set_header(header_bl
);
400 /* If we loaded a legacy sessionmap, then erase the old data. If
401 * an old-versioned MDS tries to read it, it'll fail out safely
402 * with an end_of_buffer exception */
404 dout(4) << __func__
<< " erasing legacy sessionmap" << dendl
;
406 loaded_legacy
= false; // only need to truncate once.
409 dout(20) << " updating keys:" << dendl
;
410 map
<string
, bufferlist
> to_set
;
411 for(std::set
<entity_name_t
>::iterator i
= dirty_sessions
.begin();
412 i
!= dirty_sessions
.end(); ++i
) {
413 const entity_name_t name
= *i
;
414 Session
*session
= session_map
[name
];
416 if (session
->is_open() ||
417 session
->is_closing() ||
418 session
->is_stale() ||
419 session
->is_killing()) {
420 dout(20) << " " << name
<< dendl
;
422 CachedStackStringStream css
;
427 session
->info
.encode(bl
, mds
->mdsmap
->get_up_features());
430 to_set
[std::string(css
->strv())] = bl
;
432 session
->clear_dirty_completed_requests();
434 dout(20) << " " << name
<< " (ignoring)" << dendl
;
437 if (!to_set
.empty()) {
441 dout(20) << " removing keys:" << dendl
;
442 set
<string
> to_remove
;
443 for(std::set
<entity_name_t
>::const_iterator i
= null_sessions
.begin();
444 i
!= null_sessions
.end(); ++i
) {
445 dout(20) << " " << *i
<< dendl
;
446 CachedStackStringStream css
;
448 to_remove
.insert(css
->str());
450 if (!to_remove
.empty()) {
451 op
.omap_rm_keys(to_remove
);
454 dirty_sessions
.clear();
455 null_sessions
.clear();
457 mds
->objecter
->mutate(oid
, oloc
, op
, snapc
,
458 ceph::real_clock::now(),
460 new C_OnFinisher(new C_IO_SM_Save(this, version
),
464 void SessionMap::_save_finish(version_t v
)
466 dout(10) << "_save_finish v" << v
<< dendl
;
469 finish_contexts(g_ceph_context
, commit_waiters
[v
]);
470 commit_waiters
.erase(v
);
475 * Deserialize sessions, and update by_state index
477 void SessionMap::decode_legacy(bufferlist::const_iterator
&p
)
479 // Populate `sessions`
480 SessionMapStore::decode_legacy(p
);
483 for (ceph::unordered_map
<entity_name_t
, Session
*>::iterator i
= session_map
.begin();
484 i
!= session_map
.end(); ++i
) {
485 Session
*s
= i
->second
;
486 auto by_state_entry
= by_state
.find(s
->get_state());
487 if (by_state_entry
== by_state
.end())
488 by_state_entry
= by_state
.emplace(s
->get_state(),
489 new xlist
<Session
*>).first
;
490 by_state_entry
->second
->push_back(&s
->item_session_list
);
494 uint64_t SessionMap::set_state(Session
*session
, int s
) {
495 if (session
->state
!= s
) {
496 session
->set_state(s
);
497 auto by_state_entry
= by_state
.find(s
);
498 if (by_state_entry
== by_state
.end())
499 by_state_entry
= by_state
.emplace(s
, new xlist
<Session
*>).first
;
500 by_state_entry
->second
->push_back(&session
->item_session_list
);
502 if (session
->is_open() || session
->is_stale()) {
503 session
->set_load_avg_decay_rate(decay_rate
);
506 // refresh number of sessions for states which have perf
507 // couters associated
508 logger
->set(l_mdssm_session_open
,
509 get_session_count_in_state(Session::STATE_OPEN
));
510 logger
->set(l_mdssm_session_stale
,
511 get_session_count_in_state(Session::STATE_STALE
));
514 return session
->get_state_seq();
517 void SessionMapStore::decode_legacy(bufferlist::const_iterator
& p
)
519 auto now
= clock::now();
522 if (pre
== (uint64_t)-1) {
523 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, p
);
524 ceph_assert(struct_v
>= 2);
530 decode(inst
.name
, p
);
531 Session
*s
= get_or_add_session(inst
);
532 if (s
->is_closed()) {
533 s
->set_state(Session::STATE_OPEN
);
534 s
->set_load_avg_decay_rate(decay_rate
);
541 // --- old format ----
544 // this is a meaningless upper bound. can be ignored.
548 while (n
-- && !p
.end()) {
550 Session
*s
= new Session(ConnectionRef());
553 auto& name
= s
->info
.inst
.name
;
554 auto it
= session_map
.find(name
);
555 if (it
!= session_map
.end()) {
556 // eager client connected too fast! aie.
557 dout(10) << " already had session for " << name
<< ", recovering" << dendl
;
566 s
->set_state(Session::STATE_OPEN
);
567 s
->set_load_avg_decay_rate(decay_rate
);
568 s
->last_cap_renew
= now
;
573 void Session::dump(Formatter
*f
, bool cap_dump
) const
575 f
->dump_int("id", info
.inst
.name
.num());
576 f
->dump_object("entity", info
.inst
);
577 f
->dump_string("state", get_state_name());
578 f
->dump_int("num_leases", leases
.size());
579 f
->dump_int("num_caps", caps
.size());
581 f
->open_array_section("caps");
582 for (const auto& cap
: caps
) {
583 f
->dump_object("cap", *cap
);
587 if (is_open() || is_stale()) {
588 f
->dump_unsigned("request_load_avg", get_load_avg());
590 f
->dump_float("uptime", get_session_uptime());
591 f
->dump_unsigned("requests_in_flight", get_request_count());
592 f
->dump_unsigned("completed_requests", get_num_completed_requests());
593 f
->dump_bool("reconnecting", reconnecting
);
594 f
->dump_object("recall_caps", recall_caps
);
595 f
->dump_object("release_caps", release_caps
);
596 f
->dump_object("recall_caps_throttle", recall_caps_throttle
);
597 f
->dump_object("recall_caps_throttle2o", recall_caps_throttle2o
);
598 f
->dump_object("session_cache_liveness", session_cache_liveness
);
599 f
->dump_object("cap_acquisition", cap_acquisition
);
601 f
->open_array_section("delegated_inos");
602 for (const auto& [start
, len
] : delegated_inos
) {
603 f
->open_object_section("ino_range");
604 f
->dump_stream("start") << start
;
605 f
->dump_unsigned("length", len
);
613 void SessionMapStore::dump(Formatter
*f
) const
615 f
->open_array_section("sessions");
616 for (const auto& p
: session_map
) {
617 f
->dump_object("session", *p
.second
);
619 f
->close_section(); // Sessions
/**
 * Append test instances of SessionMapStore to `ls`.
 *
 * Currently a single default-constructed store is added; it is allocated
 * with `new` and the raw pointer is stored in the list.
 *
 * @param ls  list that receives the new instance
 */
void SessionMapStore::generate_test_instances(std::list<SessionMapStore*>& ls)
{
  // pretty boring for now
  ls.push_back(new SessionMapStore());
}
628 void SessionMap::wipe()
630 dout(1) << "wipe start" << dendl
;
632 while (!session_map
.empty()) {
633 Session
*s
= session_map
.begin()->second
;
636 version
= ++projected
;
637 dout(1) << "wipe result" << dendl
;
639 dout(1) << "wipe done" << dendl
;
642 void SessionMap::wipe_ino_prealloc()
644 for (ceph::unordered_map
<entity_name_t
,Session
*>::iterator p
= session_map
.begin();
645 p
!= session_map
.end();
647 p
->second
->pending_prealloc_inos
.clear();
648 p
->second
->free_prealloc_inos
.clear();
649 p
->second
->delegated_inos
.clear();
650 p
->second
->info
.prealloc_inos
.clear();
652 projected
= ++version
;
655 void SessionMap::add_session(Session
*s
)
657 dout(10) << __func__
<< " s=" << s
<< " name=" << s
->info
.inst
.name
<< dendl
;
659 ceph_assert(session_map
.count(s
->info
.inst
.name
) == 0);
660 session_map
[s
->info
.inst
.name
] = s
;
661 auto by_state_entry
= by_state
.find(s
->state
);
662 if (by_state_entry
== by_state
.end())
663 by_state_entry
= by_state
.emplace(s
->state
, new xlist
<Session
*>).first
;
664 by_state_entry
->second
->push_back(&s
->item_session_list
);
667 update_average_birth_time(*s
);
669 logger
->set(l_mdssm_session_count
, session_map
.size());
670 logger
->inc(l_mdssm_session_add
);
673 void SessionMap::remove_session(Session
*s
)
675 dout(10) << __func__
<< " s=" << s
<< " name=" << s
->info
.inst
.name
<< dendl
;
677 update_average_birth_time(*s
, false);
679 s
->trim_completed_requests(0);
680 s
->item_session_list
.remove_myself();
681 session_map
.erase(s
->info
.inst
.name
);
682 dirty_sessions
.erase(s
->info
.inst
.name
);
683 null_sessions
.insert(s
->info
.inst
.name
);
686 logger
->set(l_mdssm_session_count
, session_map
.size());
687 logger
->inc(l_mdssm_session_remove
);
690 void SessionMap::touch_session(Session
*session
)
692 dout(10) << __func__
<< " s=" << session
<< " name=" << session
->info
.inst
.name
<< dendl
;
694 // Move to the back of the session list for this state (should
695 // already be on a list courtesy of add_session and set_state)
696 ceph_assert(session
->item_session_list
.is_on_list());
697 auto by_state_entry
= by_state
.find(session
->state
);
698 if (by_state_entry
== by_state
.end())
699 by_state_entry
= by_state
.emplace(session
->state
,
700 new xlist
<Session
*>).first
;
701 by_state_entry
->second
->push_back(&session
->item_session_list
);
703 session
->last_cap_renew
= clock::now();
706 void SessionMap::_mark_dirty(Session
*s
, bool may_save
)
708 if (dirty_sessions
.count(s
->info
.inst
.name
))
712 dirty_sessions
.size() >= g_conf()->mds_sessionmap_keys_per_op
) {
713 // Pre-empt the usual save() call from journal segment trim, in
714 // order to avoid building up an oversized OMAP update operation
715 // from too many sessions modified at once
716 save(new C_MDSInternalNoop
, version
);
719 null_sessions
.erase(s
->info
.inst
.name
);
720 dirty_sessions
.insert(s
->info
.inst
.name
);
723 void SessionMap::mark_dirty(Session
*s
, bool may_save
)
725 dout(20) << __func__
<< " s=" << s
<< " name=" << s
->info
.inst
.name
726 << " v=" << version
<< dendl
;
728 _mark_dirty(s
, may_save
);
/**
 * Mark a session dirty during replay and advance the map version.
 *
 * @param s  session to mark dirty
 */
void SessionMap::replay_dirty_session(Session *s)
{
  dout(20) << __func__ << " s=" << s << " name=" << s->info.inst.name
    << " v=" << version << dendl;

  // may_save is passed as false, so _mark_dirty() is told not to start a
  // save() from here.
  _mark_dirty(s, false);

  replay_advance_version();
}
743 void SessionMap::replay_advance_version()
749 void SessionMap::replay_open_sessions(version_t event_cmapv
,
750 map
<client_t
,entity_inst_t
>& client_map
,
751 map
<client_t
,client_metadata_t
>& client_metadata_map
)
753 unsigned already_saved
;
755 if (version
+ client_map
.size() < event_cmapv
)
758 // Server::finish_force_open_sessions() marks sessions dirty one by one.
759 // Marking a session dirty may flush all existing dirty sessions. So it's
760 // possible that some sessions are already saved in sessionmap.
761 already_saved
= client_map
.size() - (event_cmapv
- version
);
762 for (const auto& p
: client_map
) {
763 Session
*s
= get_or_add_session(p
.second
);
764 auto q
= client_metadata_map
.find(p
.first
);
765 if (q
!= client_metadata_map
.end())
766 s
->info
.client_metadata
.merge(q
->second
);
768 if (already_saved
> 0) {
776 set_state(s
, Session::STATE_OPEN
);
777 replay_dirty_session(s
);
782 mds
->clog
->error() << "error replaying open sessions(" << client_map
.size()
783 << ") sessionmap v " << event_cmapv
<< " table " << version
;
784 ceph_assert(g_conf()->mds_wipe_sessions
);
785 mds
->sessionmap
.wipe();
786 mds
->sessionmap
.set_version(event_cmapv
);
789 version_t
SessionMap::mark_projected(Session
*s
)
791 dout(20) << __func__
<< " s=" << s
<< " name=" << s
->info
.inst
.name
792 << " pv=" << projected
<< " -> " << projected
+ 1 << dendl
;
794 s
->push_pv(projected
);
799 class C_IO_SM_Save_One
: public SessionMapIOContext
{
802 C_IO_SM_Save_One(SessionMap
*cm
, MDSContext
*on_safe_
)
803 : SessionMapIOContext(cm
), on_safe(on_safe_
) {}
804 void finish(int r
) override
{
806 get_mds()->handle_write_error(r
);
808 on_safe
->complete(r
);
811 void print(ostream
& out
) const override
{
812 out
<< "session_save_one";
818 void SessionMap::save_if_dirty(const std::set
<entity_name_t
> &tgt_sessions
,
819 MDSGatherBuilder
*gather_bld
)
821 ceph_assert(gather_bld
!= NULL
);
823 std::vector
<entity_name_t
> write_sessions
;
825 // Decide which sessions require a write
826 for (std::set
<entity_name_t
>::iterator i
= tgt_sessions
.begin();
827 i
!= tgt_sessions
.end(); ++i
) {
828 const entity_name_t
&session_id
= *i
;
830 if (session_map
.count(session_id
) == 0) {
831 // Session isn't around any more, never mind.
835 Session
*session
= session_map
[session_id
];
836 if (!session
->has_dirty_completed_requests()) {
837 // Session hasn't had completed_requests
838 // modified since last write, no need to
843 if (dirty_sessions
.count(session_id
) > 0) {
844 // Session is already dirtied, will be written, no
845 // need to pre-empt that.
848 // Okay, passed all our checks, now we write
849 // this session out. The version we write
850 // into the OMAP may now be higher-versioned
851 // than the version in the header, but that's
852 // okay because it's never a problem to have
853 // an overly-fresh copy of a session.
854 write_sessions
.push_back(*i
);
857 dout(4) << __func__
<< ": writing " << write_sessions
.size() << dendl
;
859 // Batch writes into mds_sessionmap_keys_per_op
860 const uint32_t kpo
= g_conf()->mds_sessionmap_keys_per_op
;
861 map
<string
, bufferlist
> to_set
;
862 for (uint32_t i
= 0; i
< write_sessions
.size(); ++i
) {
863 const entity_name_t
&session_id
= write_sessions
[i
];
864 Session
*session
= session_map
[session_id
];
865 session
->clear_dirty_completed_requests();
868 CachedStackStringStream css
;
873 session
->info
.encode(bl
, mds
->mdsmap
->get_up_features());
876 to_set
[css
->str()] = bl
;
878 // Complete this write transaction?
879 if (i
== write_sessions
.size() - 1
880 || i
% kpo
== kpo
- 1) {
883 to_set
.clear(); // clear to start a new transaction
886 object_t oid
= get_object_name();
887 object_locator_t
oloc(mds
->mdsmap
->get_metadata_pool());
888 MDSContext
*on_safe
= gather_bld
->new_sub();
889 mds
->objecter
->mutate(oid
, oloc
, op
, snapc
,
890 ceph::real_clock::now(), 0,
892 new C_IO_SM_Save_One(this, on_safe
),
902 #define dout_prefix *_dout << "Session "
905 * Calculate the length of the `requests` member list,
906 * because elist does not have a size() method.
910 size_t Session::get_request_count() const
913 for (auto p
= requests
.begin(); !p
.end(); ++p
)
919 * Capped in response to a CEPH_MSG_CLIENT_CAPRELEASE message,
920 * with n_caps equal to the number of caps that were released
921 * in the message. Used to update state about how many caps a
922 * client has released since it was last instructed to RECALL_STATE.
924 void Session::notify_cap_release(size_t n_caps
)
926 recall_caps
.hit(-(double)n_caps
);
927 release_caps
.hit(n_caps
);
931 * Called when a CEPH_MSG_CLIENT_SESSION->CEPH_SESSION_RECALL_STATE
932 * message is sent to the client. Update our recall-related state
933 * in order to generate health metrics if the session doesn't see
934 * a commensurate number of calls to ::notify_cap_release
936 uint64_t Session::notify_recall_sent(size_t new_limit
)
938 const auto num_caps
= caps
.size();
939 ceph_assert(new_limit
< num_caps
); // Behaviour of Server::recall_client_state
940 const auto count
= num_caps
-new_limit
;
942 if (recall_limit
!= new_limit
) {
945 new_change
= 0; /* no change! */
948 /* Always hit the session counter as a RECALL message is still sent to the
949 * client and we do not want the MDS to burn its global counter tokens on a
950 * session that is not releasing caps (i.e. allow the session counter to
951 * throttle future RECALL messages).
953 recall_caps_throttle
.hit(count
);
954 recall_caps_throttle2o
.hit(count
);
955 recall_caps
.hit(count
);
960 * Use client metadata to generate a somewhat-friendlier
961 * name for the client than its session ID.
963 * This is *not* guaranteed to be unique, and any machine
964 * consumers of session-related output should always use
965 * the session ID as a primary capacity and use this only
966 * as a presentation hint.
968 void Session::_update_human_name()
970 auto info_client_metadata_entry
= info
.client_metadata
.find("hostname");
971 if (info_client_metadata_entry
!= info
.client_metadata
.end()) {
972 // Happy path, refer to clients by hostname
973 human_name
= info_client_metadata_entry
->second
;
974 if (!info
.auth_name
.has_default_id()) {
975 // When a non-default entity ID is set by the user, assume they
976 // would like to see it in references to the client, if it's
977 // reasonable short. Limit the length because we don't want
978 // to put e.g. uuid-generated names into a "human readable"
980 const int arbitrarily_short
= 16;
981 if (info
.auth_name
.get_id().size() < arbitrarily_short
) {
982 human_name
+= std::string(":") + info
.auth_name
.get_id();
986 // Fallback, refer to clients by ID e.g. client.4567
987 human_name
= stringify(info
.inst
.name
.num());
991 void Session::decode(bufferlist::const_iterator
&p
)
995 free_prealloc_inos
= info
.prealloc_inos
;
997 _update_human_name();
1000 int Session::check_access(CInode
*in
, unsigned mask
,
1001 int caller_uid
, int caller_gid
,
1002 const vector
<uint64_t> *caller_gid_list
,
1003 int new_uid
, int new_gid
)
1006 CInode
*diri
= NULL
;
1008 diri
= in
->get_projected_parent_dn()->get_dir()->get_inode();
1009 if (diri
&& diri
->is_stray()){
1010 path
= in
->get_projected_inode()->stray_prior_path
;
1011 dout(20) << __func__
<< " stray_prior_path " << path
<< dendl
;
1013 in
->make_path_string(path
, true);
1014 dout(20) << __func__
<< " path " << path
<< dendl
;
1017 path
= path
.substr(1); // drop leading /
1019 const auto& inode
= in
->get_inode();
1021 inode
->has_layout() &&
1022 inode
->layout
.pool_ns
.length() &&
1023 !connection
->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
)) {
1024 dout(10) << __func__
<< " client doesn't support FS_FILE_LAYOUT_V2" << dendl
;
1028 if (!auth_caps
.is_capable(path
, inode
->uid
, inode
->gid
, inode
->mode
,
1029 caller_uid
, caller_gid
, caller_gid_list
, mask
,
1032 return -CEPHFS_EACCES
;
1037 // track total and per session load
1038 void SessionMap::hit_session(Session
*session
) {
1039 uint64_t sessions
= get_session_count_in_state(Session::STATE_OPEN
) +
1040 get_session_count_in_state(Session::STATE_STALE
) +
1041 get_session_count_in_state(Session::STATE_CLOSING
);
1042 ceph_assert(sessions
!= 0);
1044 double total_load
= total_load_avg
.hit();
1045 double avg_load
= total_load
/ sessions
;
1047 logger
->set(l_mdssm_total_load
, (uint64_t)total_load
);
1048 logger
->set(l_mdssm_avg_load
, (uint64_t)avg_load
);
1050 session
->hit_session();
1053 void SessionMap::handle_conf_change(const std::set
<std::string
>& changed
)
1055 auto apply_to_open_sessions
= [this](auto f
) {
1056 if (auto it
= by_state
.find(Session::STATE_OPEN
); it
!= by_state
.end()) {
1057 for (const auto &session
: *(it
->second
)) {
1061 if (auto it
= by_state
.find(Session::STATE_STALE
); it
!= by_state
.end()) {
1062 for (const auto &session
: *(it
->second
)) {
1068 if (changed
.count("mds_request_load_average_decay_rate")) {
1069 auto d
= g_conf().get_val
<double>("mds_request_load_average_decay_rate");
1072 total_load_avg
= DecayCounter(d
);
1074 auto mut
= [d
](auto s
) {
1075 s
->set_load_avg_decay_rate(d
);
1077 apply_to_open_sessions(mut
);
1079 if (changed
.count("mds_recall_max_decay_rate")) {
1080 auto d
= g_conf().get_val
<double>("mds_recall_max_decay_rate");
1081 auto mut
= [d
](auto s
) {
1082 s
->recall_caps_throttle
= DecayCounter(d
);
1084 apply_to_open_sessions(mut
);
1086 if (changed
.count("mds_recall_warning_decay_rate")) {
1087 auto d
= g_conf().get_val
<double>("mds_recall_warning_decay_rate");
1088 auto mut
= [d
](auto s
) {
1089 s
->recall_caps
= DecayCounter(d
);
1090 s
->release_caps
= DecayCounter(d
);
1092 apply_to_open_sessions(mut
);
1094 if (changed
.count("mds_session_cache_liveness_decay_rate")) {
1095 auto d
= g_conf().get_val
<double>("mds_session_cache_liveness_decay_rate");
1096 auto mut
= [d
](auto s
) {
1097 s
->session_cache_liveness
= DecayCounter(d
);
1098 s
->session_cache_liveness
.hit(s
->caps
.size()); /* so the MDS doesn't immediately start trimming a new session */
1100 apply_to_open_sessions(mut
);
1102 if (changed
.count("mds_session_cap_acquisition_decay_rate")) {
1103 auto d
= g_conf().get_val
<double>("mds_session_cap_acquisition_decay_rate");
1104 auto mut
= [d
](auto s
) {
1105 s
->cap_acquisition
= DecayCounter(d
);
1107 apply_to_open_sessions(mut
);
1111 void SessionMap::update_average_session_age() {
1112 if (!session_map
.size()) {
1116 double avg_uptime
= std::chrono::duration
<double>(clock::now()-avg_birth_time
).count();
1117 logger
->set(l_mdssm_avg_session_uptime
, (uint64_t)avg_uptime
);
1120 int SessionFilter::parse(
1121 const std::vector
<std::string
> &args
,
1124 ceph_assert(ss
!= NULL
);
1126 for (const auto &s
: args
) {
1127 dout(20) << __func__
<< " parsing filter '" << s
<< "'" << dendl
;
1129 auto eq
= s
.find("=");
1130 if (eq
== std::string::npos
|| eq
== s
.size()) {
1131 // allow this to be a bare id for compatibility with pre-octopus asok
1134 id
= strict_strtoll(s
.c_str(), 10, &err
);
1136 *ss
<< "Invalid filter '" << s
<< "'";
1137 return -CEPHFS_EINVAL
;
1142 // Keys that start with this are to be taken as referring
1143 // to freeform client metadata fields.
1144 const std::string
metadata_prefix("client_metadata.");
1146 auto k
= s
.substr(0, eq
);
1147 auto v
= s
.substr(eq
+ 1);
1149 dout(20) << __func__
<< " parsed k='" << k
<< "', v='" << v
<< "'" << dendl
;
1151 if (k
.compare(0, metadata_prefix
.size(), metadata_prefix
) == 0
1152 && k
.size() > metadata_prefix
.size()) {
1153 // Filter on arbitrary metadata key (no fixed schema for this,
1154 // so anything after the dot is a valid field to filter on)
1155 auto metadata_key
= k
.substr(metadata_prefix
.size());
1156 metadata
.insert(std::make_pair(metadata_key
, v
));
1157 } else if (k
== "auth_name") {
1158 // Filter on client entity name
1160 } else if (k
== "state") {
1162 } else if (k
== "id") {
1164 id
= strict_strtoll(v
.c_str(), 10, &err
);
1167 return -CEPHFS_EINVAL
;
1169 } else if (k
== "reconnecting") {
1172 * Strict boolean parser. Allow true/false/0/1.
1173 * Anything else is -CEPHFS_EINVAL.
1175 auto is_true
= [](std::string_view bstr
, bool *out
) -> bool
1177 ceph_assert(out
!= nullptr);
1179 if (bstr
== "true" || bstr
== "1") {
1182 } else if (bstr
== "false" || bstr
== "0") {
1186 return -CEPHFS_EINVAL
;
1191 int r
= is_true(v
, &bval
);
1193 set_reconnecting(bval
);
1195 *ss
<< "Invalid boolean value '" << v
<< "'";
1196 return -CEPHFS_EINVAL
;
1199 *ss
<< "Invalid filter key '" << k
<< "'";
1200 return -CEPHFS_EINVAL
;
1207 bool SessionFilter::match(
1208 const Session
&session
,
1209 std::function
<bool(client_t
)> is_reconnecting
) const
1211 for (const auto &m
: metadata
) {
1212 const auto &k
= m
.first
;
1213 const auto &v
= m
.second
;
1214 auto it
= session
.info
.client_metadata
.find(k
);
1215 if (it
== session
.info
.client_metadata
.end()) {
1218 if (it
->second
!= v
) {
1223 if (!auth_name
.empty() && auth_name
!= session
.info
.auth_name
.get_id()) {
1227 if (!state
.empty() && state
!= session
.get_state_name()) {
1231 if (id
!= 0 && id
!= session
.info
.inst
.name
.num()) {
1235 if (reconnecting
.first
) {
1236 const bool am_reconnecting
= is_reconnecting(session
.info
.inst
.name
.num());
1237 if (reconnecting
.second
!= am_reconnecting
) {
1245 std::ostream
& operator<<(std::ostream
&out
, const Session
&s
)
1247 if (s
.get_human_name() == stringify(s
.get_client())) {
1248 out
<< s
.get_human_name();
1250 out
<< s
.get_human_name() << " (" << std::dec
<< s
.get_client() << ")";