1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/ceph/ceph_debug.h>
6 #include <linux/random.h>
7 #include <linux/slab.h>
8 #include <linux/types.h>
10 #include <linux/ceph/mdsmap.h>
11 #include <linux/ceph/messenger.h>
12 #include <linux/ceph/decode.h>
/*
 * True when rank i is up (state > 0) and, unless the caller asked to
 * ignore lagginess, is not marked laggy.
 *
 * The laggy test is parenthesized on purpose: "&&" binds tighter than
 * "?:", so the unparenthesized form parsed as
 *   (state > 0 && ignore_laggy) ? true : !laggy
 * and reported a rank that is not up as ready whenever ignore_laggy was
 * false and the rank was not laggy.
 *
 * Expects a pointer named "m" with an m_info[] array (state, laggy) to be
 * in scope at the expansion site.
 */
#define CEPH_MDS_IS_READY(i, ignore_laggy) \
	(m->m_info[i].state > 0 && ((ignore_laggy) || !m->m_info[i].laggy))
/*
 * Helper for random rank selection among the ranks that
 * CEPH_MDS_IS_READY() accepts; ignore_laggy is forwarded to that macro.
 *
 * Two passes over ranks [0, possible_max_rank) are visible: the first tests
 * every rank, then n is replaced by a random value modulo n, and the second
 * pass walks the ranks again with a running counter j.
 *
 * NOTE(review): the declarations, the statements guarded by the two
 * CEPH_MDS_IS_READY() tests, and the return are not visible in this view.
 * In particular, confirm that n cannot be 0 when it is used as the modulus.
 */
19 static int __mdsmap_get_random_mds(struct ceph_mdsmap
*m
, bool ignore_laggy
)
/* first pass: test every possible rank */
25 for (i
= 0; i
< m
->possible_max_rank
; i
++)
26 if (CEPH_MDS_IS_READY(i
, ignore_laggy
))
/* reduce n to a random value in [0, n) */
32 n
= prandom_u32() % n
;
/* second pass: walk the ranks again, j starting at 0 */
33 for (j
= 0, i
= 0; i
< m
->possible_max_rank
; i
++) {
34 if (CEPH_MDS_IS_READY(i
, ignore_laggy
))
/*
 * Public entry point for random MDS selection.
 *
 * Tries __mdsmap_get_random_mds() with ignore_laggy == false first.  If
 * that yields either -1 or possible_max_rank, it retries with
 * ignore_laggy == true.  A final result equal to possible_max_rank is
 * converted to -1, so callers only ever see a rank below
 * possible_max_rank or -1.
 */
44 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
46 int ceph_mdsmap_get_random_mds(struct ceph_mdsmap
*m
)
/* first attempt: laggy ranks are not ignored by the readiness test */
50 mds
= __mdsmap_get_random_mds(m
, false);
/* nothing usable found: retry with ignore_laggy set */
51 if (mds
== m
->possible_max_rank
|| mds
== -1)
52 mds
= __mdsmap_get_random_mds(m
, true);
/* possible_max_rank is an out-of-range result; report it as -1 */
54 return mds
== m
->possible_max_rank
? -1 : mds
;
/*
 * Skip one fixed-size value of the given type in the decode stream.
 * The visible part checks that sizeof(type) bytes remain before end.
 * NOTE(review): the rest of the macro is not visible here; presumably it
 * jumps to the "bad" label on failure and advances *p -- confirm.
 */
57 #define __decode_and_drop_type(p, end, type, bad) \
59 if (*p + sizeof(type) > end) \
/*
 * Skip a length-prefixed sequence of fixed-size elements: decode a u32
 * element count n, compute need = sizeof(type) * n, and verify that many
 * bytes are available (jumping to "bad" otherwise).
 * NOTE(review): the declarations of n/need and the pointer advance are not
 * visible here; confirm the multiplication cannot wrap for a large n.
 */
64 #define __decode_and_drop_set(p, end, type, bad) \
68 ceph_decode_32_safe(p, end, n, bad); \
69 need = sizeof(type) * n; \
70 ceph_decode_need(p, end, need, bad); \
/*
 * Skip a length-prefixed sequence of fixed-size (key, value) pairs: decode
 * a u32 pair count n, compute need = (sizeof(ktype) + sizeof(vtype)) * n,
 * and verify that many bytes are available (jumping to "bad" otherwise).
 * NOTE(review): the declarations of n/need and the pointer advance are not
 * visible here; confirm the multiplication cannot wrap for a large n.
 */
74 #define __decode_and_drop_map(p, end, ktype, vtype, bad) \
78 ceph_decode_32_safe(p, end, n, bad); \
79 need = (sizeof(ktype) + sizeof(vtype)) * n; \
80 ceph_decode_need(p, end, need, bad); \
/*
 * Walk over an encoded compat set without keeping any of it.
 *
 * The set consists of three sections (compat, ro_compat, incompat).  For
 * each one the visible code requires a u64 plus a u32 to be present, reads
 * the u32 as an entry count n, and then for the name entries requires
 * another u64 + u32, reads the u32 as a string length and checks that
 * "len" bytes are available.
 *
 * The caller in this file treats a negative return value as failure.
 * NOTE(review): declarations, the pointer advances over the u64 fields and
 * strings, the inner loop header, the "bad" label and the return
 * statements are not visible in this view.
 */
85 static int __decode_and_drop_compat_set(void **p
, void* end
)
88 /* compat, ro_compat, incompat*/
89 for (i
= 0; i
< 3; i
++) {
/* need a u64 followed by the u32 entry count */
91 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
94 /* names (map<u64, string>) */
95 n
= ceph_decode_32(p
);
/* each name entry: a u64 key followed by a u32 string length */
98 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
),
101 len
= ceph_decode_32(p
);
/* the string body itself must fit in the buffer */
102 ceph_decode_need(p
, end
, len
, bad
);
/*
 * Decode an encoded MDS map from the buffer at *p (bounded by end) into a
 * newly allocated struct ceph_mdsmap.  msgr2 is forwarded to
 * ceph_decode_entity_addrvec() when MDS addresses are decoded.
 *
 * ERR_PTR(-ENOMEM) is returned right after the kzalloc() of the map
 * (presumably when that allocation fails).  On the corrupt-map path the
 * buffer is hex-dumped and the partially built map is destroyed.
 *
 * NOTE(review): many lines of this function are not visible in this view
 * (local declarations, several conditions and goto statements, the labels
 * and the final return).  The comments below describe only the statements
 * that are shown.
 */
114 * Ignore any fields we don't care about (there are quite a few of
117 struct ceph_mdsmap
*ceph_mdsmap_decode(void **p
, void *end
, bool msgr2
)
119 struct ceph_mdsmap
*m
;
/* remembered so the corrupt-map path can hex-dump from the beginning */
120 const void *start
= *p
;
127 m
= kzalloc(sizeof(*m
), GFP_NOFS
);
129 return ERR_PTR(-ENOMEM
);
/* two leading bytes: the map version, then a compat version that is skipped */
131 ceph_decode_need(p
, end
, 1 + 1, bad
);
132 mdsmap_v
= ceph_decode_8(p
);
133 *p
+= sizeof(u8
); /* mdsmap_cv */
/* encoded payload length; end is narrowed to *p + mdsmap_len below */
136 ceph_decode_32_safe(p
, end
, mdsmap_len
, bad
);
137 if (end
< *p
+ mdsmap_len
)
139 end
= *p
+ mdsmap_len
;
/* fixed header: the eight u32 fields and one u64 field decoded below */
142 ceph_decode_need(p
, end
, 8*sizeof(u32
) + sizeof(u64
), bad
);
143 m
->m_epoch
= ceph_decode_32(p
);
144 m
->m_client_epoch
= ceph_decode_32(p
);
145 m
->m_last_failure
= ceph_decode_32(p
);
146 m
->m_root
= ceph_decode_32(p
);
147 m
->m_session_timeout
= ceph_decode_32(p
);
148 m
->m_session_autoclose
= ceph_decode_32(p
);
149 m
->m_max_file_size
= ceph_decode_64(p
);
150 m
->m_max_mds
= ceph_decode_32(p
);
153 * pick out the active nodes as the m_num_active_mds, the
154 * m_num_active_mds maybe larger than m_max_mds when decreasing
155 * the max_mds in cluster side, in other case it should less
156 * than or equal to m_max_mds.
/* n doubles as the number of per-MDS info records decoded by the loop below */
158 m
->m_num_active_mds
= n
= ceph_decode_32(p
);
161 * the possible max rank, it maybe larger than the m_num_active_mds,
162 * for example if the mds_max == 2 in the cluster, when the MDS(0)
163 * was laggy and being replaced by a new MDS, we will temporarily
164 * receive a new mds map with n_num_mds == 1 and the active MDS(1),
165 * and the mds rank >= m_num_active_mds.
167 m
->possible_max_rank
= max(m
->m_num_active_mds
, m
->m_max_mds
);
/* one zero-initialized m_info slot per possible rank */
169 m
->m_info
= kcalloc(m
->possible_max_rank
, sizeof(*m
->m_info
), GFP_NOFS
);
173 /* pick out active nodes from mds_info (state > 0) */
174 for (i
= 0; i
< n
; i
++) {
179 void *info_end
= NULL
;
180 struct ceph_entity_addr addr
;
181 u32 num_export_targets
;
182 void *pexport_targets
= NULL
;
183 struct ceph_timespec laggy_since
;
184 struct ceph_mds_info
*info
;
/* record header: 64-bit global id followed by a one-byte info version */
187 ceph_decode_need(p
, end
, sizeof(u64
) + 1, bad
);
188 global_id
= ceph_decode_64(p
);
189 info_v
= ceph_decode_8(p
);
/* skip the info compat byte; info_len marks where this record ends */
192 ceph_decode_need(p
, end
, 1 + sizeof(u32
), bad
);
193 *p
+= sizeof(u8
); /* info_cv */
194 info_len
= ceph_decode_32(p
);
195 info_end
= *p
+ info_len
;
200 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
202 namelen
= ceph_decode_32(p
); /* skip mds name */
/* rank, incarnation and state of this MDS */
205 ceph_decode_32_safe(p
, end
, mds
, bad
);
206 ceph_decode_32_safe(p
, end
, inc
, bad
);
207 ceph_decode_32_safe(p
, end
, state
, bad
);
208 *p
+= sizeof(u64
); /* state_seq */
/* the address is decoded either as an addrvec or as a single entity addr */
210 err
= ceph_decode_entity_addrvec(p
, end
, msgr2
, &addr
);
212 err
= ceph_decode_entity_addr(p
, end
, &addr
);
216 ceph_decode_copy_safe(p
, end
, &laggy_since
, sizeof(laggy_since
),
/* any non-zero laggy_since timestamp marks the MDS as laggy */
218 laggy
= laggy_since
.tv_sec
!= 0 || laggy_since
.tv_nsec
!= 0;
220 ceph_decode_32_safe(p
, end
, namelen
, bad
);
/*
 * Remember where the export targets start and step over them; they are
 * decoded further down through pexport_targets.
 * NOTE(review): no bounds check of num_export_targets * sizeof(u32)
 * against end is visible before the targets are read back; confirm the
 * lines not shown validate it.
 */
223 ceph_decode_32_safe(p
, end
, num_export_targets
, bad
);
224 pexport_targets
= *p
;
225 *p
+= num_export_targets
* sizeof(u32
);
227 num_export_targets
= 0;
/* only meaningful when info_end was set from info_len above */
230 if (info_end
&& *p
!= info_end
) {
236 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n",
237 i
+1, n
, global_id
, mds
, inc
,
239 ceph_mds_state_name(state
),
240 laggy
? "(laggy)" : "");
/* a rank outside [0, possible_max_rank) cannot index m_info */
242 if (mds
< 0 || mds
>= m
->possible_max_rank
) {
243 pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds
);
248 dout("mdsmap_decode got incorrect state(%s)\n",
249 ceph_mds_state_name(state
));
/* fill in the slot for this rank */
253 info
= &m
->m_info
[mds
];
254 info
->global_id
= global_id
;
258 info
->num_export_targets
= num_export_targets
;
259 if (num_export_targets
) {
260 info
->export_targets
= kcalloc(num_export_targets
,
261 sizeof(u32
), GFP_NOFS
);
262 if (!info
->export_targets
)
/* decode the targets from the position saved in pexport_targets */
264 for (j
= 0; j
< num_export_targets
; j
++) {
265 target
= ceph_decode_32(&pexport_targets
);
266 if (target
>= m
->possible_max_rank
) {
270 info
->export_targets
[j
] = target
;
273 info
->export_targets
= NULL
;
/* data pg pools: a u32 count followed by that many u64 pool ids */
278 ceph_decode_32_safe(p
, end
, n
, bad
);
279 m
->m_num_data_pg_pools
= n
;
280 m
->m_data_pg_pools
= kcalloc(n
, sizeof(u64
), GFP_NOFS
);
281 if (!m
->m_data_pg_pools
)
/* n + 1 u64 values: the n data pools plus m_cas_pg_pool decoded right after */
283 ceph_decode_need(p
, end
, sizeof(u64
)*(n
+1), bad
);
284 for (i
= 0; i
< n
; i
++)
285 m
->m_data_pg_pools
[i
] = ceph_decode_64(p
);
286 m
->m_cas_pg_pool
= ceph_decode_64(p
);
/* initial value; replaced by the decoded byte below when mdsmap_ev >= 8 */
287 m
->m_enabled
= m
->m_epoch
> 1;
/* extended section: decode failures from here on jump to bad_ext */
291 ceph_decode_16_safe(p
, end
, mdsmap_ev
, bad_ext
);
293 if (mdsmap_ev
>= 3) {
294 if (__decode_and_drop_compat_set(p
, end
) < 0)
299 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
301 __decode_and_drop_type(p
, end
, u64
, bad_ext
);
304 /* created + modified + tableserver */
305 __decode_and_drop_type(p
, end
, struct ceph_timespec
, bad_ext
);
306 __decode_and_drop_type(p
, end
, struct ceph_timespec
, bad_ext
);
307 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
/* a list of n u32 rank ids; in-range ranks are checked for the laggy flag */
312 ceph_decode_32_safe(p
, end
, n
, bad_ext
);
313 ceph_decode_need(p
, end
, sizeof(u32
) * n
, bad_ext
);
315 for (i
= 0; i
< n
; i
++) {
316 s32 mds
= ceph_decode_32(p
);
317 if (mds
>= 0 && mds
< m
->possible_max_rank
) {
318 if (m
->m_info
[mds
].laggy
)
322 m
->m_num_laggy
= num_laggy
;
/* the list had more entries than m_info has slots: grow m_info to n entries */
324 if (n
> m
->possible_max_rank
) {
325 void *new_m_info
= krealloc(m
->m_info
,
326 n
* sizeof(*m
->m_info
),
327 GFP_NOFS
| __GFP_ZERO
);
330 m
->m_info
= new_m_info
;
332 m
->possible_max_rank
= n
;
/* two maps and two sets that are skipped without being stored */
336 __decode_and_drop_map(p
, end
, u32
, u32
, bad_ext
);
338 __decode_and_drop_map(p
, end
, u32
, u64
, bad_ext
);
340 __decode_and_drop_set(p
, end
, u32
, bad_ext
);
342 __decode_and_drop_set(p
, end
, u32
, bad_ext
);
344 if (mdsmap_ev
>= 4) {
345 /* last_failure_osd_epoch */
346 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
348 if (mdsmap_ev
>= 6) {
349 /* ever_allowed_snaps */
350 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
351 /* explicitly_allowed_snaps */
352 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
354 if (mdsmap_ev
>= 7) {
355 /* inline_data_enabled */
356 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
358 if (mdsmap_ev
>= 8) {
/* m_enabled comes from the wire here, then a length-prefixed name is checked */
361 ceph_decode_8_safe(p
, end
, m
->m_enabled
, bad_ext
);
362 ceph_decode_32_safe(p
, end
, name_len
, bad_ext
);
363 ceph_decode_need(p
, end
, name_len
, bad_ext
);
367 if (mdsmap_ev
>= 9) {
/* a u32-counted list of u32 values; a non-empty list sets m_damaged */
369 ceph_decode_32_safe(p
, end
, n
, bad_ext
);
370 need
= sizeof(u32
) * n
;
371 ceph_decode_need(p
, end
, need
, bad_ext
);
373 m
->m_damaged
= n
> 0;
375 m
->m_damaged
= false;
378 dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
379 !!m
->m_enabled
, !!m
->m_damaged
, m
->m_num_laggy
);
381 dout("mdsmap_decode success epoch %u\n", m
->m_epoch
);
/* corrupt-map path: log, hex-dump the buffer from start to end, free the map */
387 pr_err("corrupt mdsmap\n");
388 print_hex_dump(KERN_DEBUG
, "mdsmap: ",
389 DUMP_PREFIX_OFFSET
, 16, 1,
390 start
, end
- start
, true);
392 ceph_mdsmap_destroy(m
);
/*
 * Release the memory owned by an MDS map: the export_targets array of
 * every rank in [0, possible_max_rank) and the m_data_pg_pools array.
 * NOTE(review): the declarations and any further frees (for example of
 * m_info and of m itself) are not visible in this view.
 */
399 void ceph_mdsmap_destroy(struct ceph_mdsmap
*m
)
/* per-rank export target arrays */
404 for (i
= 0; i
< m
->possible_max_rank
; i
++)
405 kfree(m
->m_info
[i
].export_targets
);
408 kfree(m
->m_data_pg_pools
);
/*
 * Report whether the cluster described by the map is usable.  The result
 * is "nr_active > 0", where the loop examines every rank in
 * [0, possible_max_rank) for CEPH_MDS_STATE_ACTIVE.
 * NOTE(review): the statements guarded by the conditions below (presumably
 * an early return when every active MDS is laggy, and the nr_active
 * increment) are not visible in this view.
 */
412 bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap
*m
)
414 int i
, nr_active
= 0;
/* laggy count equals the number of active MDSs */
419 if (m
->m_num_laggy
== m
->m_num_active_mds
)
/* look for ranks in the active state */
421 for (i
= 0; i
< m
->possible_max_rank
; i
++) {
422 if (m
->m_info
[i
].state
== CEPH_MDS_STATE_ACTIVE
)
425 return nr_active
> 0;