1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/ceph/ceph_debug.h>
6 #include <linux/random.h>
7 #include <linux/slab.h>
8 #include <linux/types.h>
10 #include <linux/ceph/mdsmap.h>
11 #include <linux/ceph/messenger.h>
12 #include <linux/ceph/decode.h>
18 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
/*
 * Select an mds from map m at random among the entries whose state is > 0.
 * The comment preceding this function says the result is such an "up" mds,
 * or -1.
 *
 * NOTE(review): this extract omits several lines of the function (local
 * declarations, loop bodies, return statements). The comments below
 * describe only the statements that are visible.
 */
20 int ceph_mdsmap_get_random_mds(struct ceph_mdsmap
*m
)
25 /* special case for one mds */
/* Taken only when the map has exactly one slot and slot 0 has state > 0. */
26 if (1 == m
->m_num_mds
&& m
->m_info
[0].state
> 0)
/*
 * First pass over all m_num_mds slots, testing each for state > 0.
 * NOTE(review): the statement run for a matching slot is not visible;
 * presumably it counts up entries into n - confirm.
 */
30 for (i
= 0; i
< m
->m_num_mds
; i
++)
31 if (m
->m_info
[i
].state
> 0)
/*
 * Reduce a pseudo-random 32-bit value modulo n, giving a value in [0, n).
 * NOTE(review): n must be non-zero here or this divides by zero; the
 * guard is not visible in this extract - confirm it exists.
 */
37 n
= prandom_u32() % n
;
/*
 * Second pass over the same slots with j starting at 0, again testing
 * state > 0. NOTE(review): presumably walks to the n-th up entry; the
 * loop body is not visible.
 */
38 for (j
= 0, i
= 0; i
< m
->m_num_mds
; i
++) {
39 if (m
->m_info
[i
].state
> 0)
/*
 * __decode_and_drop_type(p, end, type, bad): the visible part tests whether
 * *p + sizeof(type) lies beyond end.
 * NOTE(review): the remaining lines of the macro are not in this extract;
 * presumably it jumps to the bad label on overrun and otherwise advances
 * *p by sizeof(type) - confirm. The argument p is used unparenthesized, so
 * callers should pass a plain identifier.
 */
48 #define __decode_and_drop_type(p, end, type, bad) \
50 if (*p + sizeof(type) > end) \
/*
 * __decode_and_drop_set(p, end, type, bad): reads a 32-bit element count
 * into n with ceph_decode_32_safe, computes need = sizeof(type) * n, and
 * passes need to ceph_decode_need together with the bad label.
 * NOTE(review): the declarations of n and need and the final advance of
 * *p are not visible in this extract; confirm that need is wide enough
 * for the multiplication not to wrap.
 */
55 #define __decode_and_drop_set(p, end, type, bad) \
59 ceph_decode_32_safe(p, end, n, bad); \
60 need = sizeof(type) * n; \
61 ceph_decode_need(p, end, need, bad); \
/*
 * __decode_and_drop_map(p, end, ktype, vtype, bad): reads a 32-bit entry
 * count into n with ceph_decode_32_safe, computes
 * need = (sizeof(ktype) + sizeof(vtype)) * n, and passes need to
 * ceph_decode_need together with the bad label.
 * NOTE(review): the declarations of n and need and the final advance of
 * *p are not visible in this extract; confirm that need is wide enough
 * for the multiplication not to wrap.
 */
65 #define __decode_and_drop_map(p, end, ktype, vtype, bad) \
69 ceph_decode_32_safe(p, end, n, bad); \
70 need = (sizeof(ktype) + sizeof(vtype)) * n; \
71 ceph_decode_need(p, end, need, bad); \
/*
 * Walk over an encoded compat set at *p (bounded by end) without storing
 * any of it. ceph_mdsmap_decode treats a negative return value from this
 * function as a failure.
 *
 * NOTE(review): the lines that consume the u64 fields, skip the name
 * bytes, and return are not visible in this extract; the comments below
 * cover only the visible statements.
 */
76 static int __decode_and_drop_compat_set(void **p
, void* end
)
79 /* compat, ro_compat, incompat*/
/* Three iterations, one per set named in the comment above. */
80 for (i
= 0; i
< 3; i
++) {
/* Require room for one u64 plus one u32 before reading the count. */
82 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
85 /* names (map<u64, string>) */
/* n is a 32-bit count read from the buffer. */
86 n
= ceph_decode_32(p
);
/* Another u64 + u32 header is required before len is read. */
89 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
),
/* len is a 32-bit length; len bytes must be available. */
92 len
= ceph_decode_32(p
);
93 ceph_decode_need(p
, end
, len
, bad
);
105 * Ignore any fields we don't care about (there are quite a few of
/*
 * Decode an encoded mdsmap located at *p (bounded by end) into a freshly
 * allocated struct ceph_mdsmap.
 *
 * Visible in this extract:
 *  - the map is allocated with kzalloc(GFP_NOFS), and a return of
 *    ERR_PTR(-ENOMEM) appears right after it;
 *  - core fields use decode helpers with the label bad, extended fields
 *    use the label bad_ext;
 *  - the error path logs "corrupt mdsmap", hex-dumps the buffer from the
 *    original start pointer, and calls ceph_mdsmap_destroy(m).
 *
 * NOTE(review): many lines of this function (braces, local declarations,
 * NULL checks, gotos, the final return) are absent from this extract.
 * Comments below describe only visible statements and mark assumptions.
 */
108 struct ceph_mdsmap
*ceph_mdsmap_decode(void **p
, void *end
)
110 struct ceph_mdsmap
*m
;
/* Remember where the buffer began; used by the hex dump on error. */
111 const void *start
= *p
;
114 u8 mdsmap_v
, mdsmap_cv
;
117 m
= kzalloc(sizeof(*m
), GFP_NOFS
);
119 return ERR_PTR(-ENOMEM
);
/* Two single-byte fields: mdsmap_v and mdsmap_cv. */
121 ceph_decode_need(p
, end
, 1 + 1, bad
);
122 mdsmap_v
= ceph_decode_8(p
);
123 mdsmap_cv
= ceph_decode_8(p
);
/*
 * A 32-bit length follows; it is compared against the space left in the
 * buffer. NOTE(review): the condition guarding this section and the
 * action taken when the length overruns end are not visible.
 */
126 ceph_decode_32_safe(p
, end
, mdsmap_len
, bad
);
127 if (end
< *p
+ mdsmap_len
)
/* Narrow end to the declared length of this map. */
129 end
= *p
+ mdsmap_len
;
/*
 * One bounds check covers the eight u32 values and one u64 read below:
 * seven u32 header fields, the u64 max file size, and the u32 count n
 * that starts the mds_info section.
 */
132 ceph_decode_need(p
, end
, 8*sizeof(u32
) + sizeof(u64
), bad
);
133 m
->m_epoch
= ceph_decode_32(p
);
134 m
->m_client_epoch
= ceph_decode_32(p
);
135 m
->m_last_failure
= ceph_decode_32(p
);
136 m
->m_root
= ceph_decode_32(p
);
137 m
->m_session_timeout
= ceph_decode_32(p
);
138 m
->m_session_autoclose
= ceph_decode_32(p
);
139 m
->m_max_file_size
= ceph_decode_64(p
);
140 m
->m_max_mds
= ceph_decode_32(p
);
/* The info array starts out sized to m_max_mds and may grow below. */
141 m
->m_num_mds
= m
->m_max_mds
;
/*
 * NOTE(review): the NULL check for this allocation is not visible in the
 * extract - confirm it exists.
 */
143 m
->m_info
= kcalloc(m
->m_num_mds
, sizeof(*m
->m_info
), GFP_NOFS
);
147 /* pick out active nodes from mds_info (state > 0) */
148 n
= ceph_decode_32(p
);
/* One iteration per encoded mds_info entry. */
149 for (i
= 0; i
< n
; i
++) {
155 void *info_end
= NULL
;
156 struct ceph_entity_addr addr
;
157 u32 num_export_targets
;
158 void *pexport_targets
= NULL
;
159 struct ceph_timespec laggy_since
;
160 struct ceph_mds_info
*info
;
/* Each entry starts with a 64-bit global id and a one-byte version. */
163 ceph_decode_need(p
, end
, sizeof(u64
) + 1, bad
);
164 global_id
= ceph_decode_64(p
);
165 info_v
= ceph_decode_8(p
);
/*
 * A compat-version byte and a 32-bit length follow; info_end marks where
 * this entry should finish. NOTE(review): the condition guarding these
 * lines is not visible, and no check of info_end against end is visible
 * either - confirm.
 */
169 ceph_decode_need(p
, end
, 1 + sizeof(u32
), bad
);
170 info_cv
= ceph_decode_8(p
);
171 info_len
= ceph_decode_32(p
);
172 info_end
= *p
+ info_len
;
177 ceph_decode_need(p
, end
, sizeof(u64
) + sizeof(u32
), bad
);
179 namelen
= ceph_decode_32(p
); /* skip mds name */
/*
 * Bounds check for the fixed-size part of the entry: four u32, one u64,
 * an address of sizeof(addr) bytes, and one ceph_timespec.
 */
182 ceph_decode_need(p
, end
,
183 4*sizeof(u32
) + sizeof(u64
) +
184 sizeof(addr
) + sizeof(struct ceph_timespec
),
186 mds
= ceph_decode_32(p
);
187 inc
= ceph_decode_32(p
);
188 state
= ceph_decode_32(p
);
189 state_seq
= ceph_decode_64(p
);
/* NOTE(review): the handling of err is not visible in this extract. */
190 err
= ceph_decode_entity_addr(p
, end
, &addr
);
/* The entry counts as laggy when either field of laggy_since is non-zero. */
193 ceph_decode_copy(p
, &laggy_since
, sizeof(laggy_since
));
194 laggy
= laggy_since
.tv_sec
!= 0 || laggy_since
.tv_nsec
!= 0;
196 ceph_decode_32_safe(p
, end
, namelen
, bad
);
/*
 * The export target list is remembered in pexport_targets and stepped
 * over; the values are read further down once the destination slot is
 * known.
 * NOTE(review): original lines 199-201 are consecutive, so no bounds
 * check against end sits between reading the count and advancing *p by
 * num_export_targets * sizeof(u32). Confirm a later check (for example
 * the info_end comparison) is enough before the data is read.
 */
199 ceph_decode_32_safe(p
, end
, num_export_targets
, bad
);
200 pexport_targets
= *p
;
201 *p
+= num_export_targets
* sizeof(u32
);
/* NOTE(review): presumably the branch for entries without export targets. */
203 num_export_targets
= 0;
/*
 * Taken when an entry length was recorded and decoding did not finish
 * exactly at info_end. The body is not visible in this extract.
 */
206 if (info_end
&& *p
!= info_end
) {
212 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n",
213 i
+1, n
, global_id
, mds
, inc
,
215 ceph_mds_state_name(state
),
216 laggy
? "(laggy)" : "");
/*
 * Entries with a negative mds or a state <= 0 are singled out here.
 * NOTE(review): presumably they are skipped; the statement is not visible.
 */
218 if (mds
< 0 || state
<= 0)
/*
 * Grow m_info when mds is outside the current array: the new size is the
 * larger of mds + 1 and twice the current size.
 * NOTE(review): the NULL check on new_m_info is not visible. Later code
 * tests m_info[].state on slots that may never have been written, so
 * confirm that krealloc in the target kernel zeroes the grown tail when
 * __GFP_ZERO is passed.
 */
221 if (mds
>= m
->m_num_mds
) {
222 int new_num
= max(mds
+ 1, m
->m_num_mds
* 2);
223 void *new_m_info
= krealloc(m
->m_info
,
224 new_num
* sizeof(*m
->m_info
),
225 GFP_NOFS
| __GFP_ZERO
);
228 m
->m_info
= new_m_info
;
229 m
->m_num_mds
= new_num
;
/* Fill in the slot indexed by mds. */
232 info
= &m
->m_info
[mds
];
233 info
->global_id
= global_id
;
237 info
->num_export_targets
= num_export_targets
;
/*
 * Copy the export targets out of the buffer region saved earlier, reading
 * through pexport_targets rather than *p.
 */
238 if (num_export_targets
) {
239 info
->export_targets
= kcalloc(num_export_targets
,
240 sizeof(u32
), GFP_NOFS
);
241 if (!info
->export_targets
)
243 for (j
= 0; j
< num_export_targets
; j
++)
244 info
->export_targets
[j
] =
245 ceph_decode_32(&pexport_targets
);
247 info
->export_targets
= NULL
;
/*
 * Runs only when the array holds more slots than m_max_mds. i counts down
 * from m_num_mds towards m_max_mds, testing for i == 0 or slot i-1 having
 * state > 0. NOTE(review): the action taken on a match is not visible.
 */
250 if (m
->m_num_mds
> m
->m_max_mds
) {
251 /* find max up mds */
252 for (i
= m
->m_num_mds
; i
>= m
->m_max_mds
; i
--) {
253 if (i
== 0 || m
->m_info
[i
-1].state
> 0)
/*
 * Data pg pools: a 32-bit count n, then n 64-bit pool ids, then one more
 * 64-bit value stored in m_cas_pg_pool. The single ceph_decode_need for
 * (n + 1) u64 values covers both.
 */
260 ceph_decode_32_safe(p
, end
, n
, bad
);
261 m
->m_num_data_pg_pools
= n
;
262 m
->m_data_pg_pools
= kcalloc(n
, sizeof(u64
), GFP_NOFS
);
263 if (!m
->m_data_pg_pools
)
265 ceph_decode_need(p
, end
, sizeof(u64
)*(n
+1), bad
);
266 for (i
= 0; i
< n
; i
++)
267 m
->m_data_pg_pools
[i
] = ceph_decode_64(p
);
268 m
->m_cas_pg_pool
= ceph_decode_64(p
);
/* Set from the epoch here; overwritten below when mdsmap_ev >= 8. */
269 m
->m_enabled
= m
->m_epoch
> 1;
/* Extended section: a 16-bit version; failures from here use bad_ext. */
273 ceph_decode_16_safe(p
, end
, mdsmap_ev
, bad_ext
);
275 if (mdsmap_ev
>= 3) {
276 if (__decode_and_drop_compat_set(p
, end
) < 0)
/* A u32 and a u64 are stepped over without being stored. */
281 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
283 __decode_and_drop_type(p
, end
, u64
, bad_ext
);
286 /* created + modified + tableserver */
287 __decode_and_drop_type(p
, end
, struct ceph_timespec
, bad_ext
);
288 __decode_and_drop_type(p
, end
, struct ceph_timespec
, bad_ext
);
289 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
/*
 * A list of n 32-bit mds ids. Ids inside [0, m_num_mds) are looked up in
 * m_info and their laggy flag is tested; the total ends up in m_num_laggy.
 * NOTE(review): the increment of num_laggy is not visible.
 */
294 ceph_decode_32_safe(p
, end
, n
, bad_ext
);
295 ceph_decode_need(p
, end
, sizeof(u32
) * n
, bad_ext
);
297 for (i
= 0; i
< n
; i
++) {
298 s32 mds
= ceph_decode_32(p
);
299 if (mds
>= 0 && mds
< m
->m_num_mds
) {
300 if (m
->m_info
[mds
].laggy
)
304 m
->m_num_laggy
= num_laggy
;
/*
 * Grow m_info to n entries when the list is longer than the array.
 * NOTE(review): the NULL check and any update of m_num_mds are not
 * visible in this extract.
 */
306 if (n
> m
->m_num_mds
) {
307 void *new_m_info
= krealloc(m
->m_info
,
308 n
* sizeof(*m
->m_info
),
309 GFP_NOFS
| __GFP_ZERO
);
312 m
->m_info
= new_m_info
;
/* Two maps and two sets are stepped over without being stored. */
318 __decode_and_drop_map(p
, end
, u32
, u32
, bad_ext
);
320 __decode_and_drop_map(p
, end
, u32
, u64
, bad_ext
);
322 __decode_and_drop_set(p
, end
, u32
, bad_ext
);
324 __decode_and_drop_set(p
, end
, u32
, bad_ext
);
/* Fields added in later encoding versions, gated on mdsmap_ev. */
326 if (mdsmap_ev
>= 4) {
327 /* last_failure_osd_epoch */
328 __decode_and_drop_type(p
, end
, u32
, bad_ext
);
330 if (mdsmap_ev
>= 6) {
331 /* ever_allowed_snaps */
332 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
333 /* explicitly_allowed_snaps */
334 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
336 if (mdsmap_ev
>= 7) {
337 /* inline_data_enabled */
338 __decode_and_drop_type(p
, end
, u8
, bad_ext
);
/*
 * From version 8, m_enabled is read from the buffer as one byte, followed
 * by a 32-bit name_len and name_len bytes that must be present.
 */
340 if (mdsmap_ev
>= 8) {
343 ceph_decode_8_safe(p
, end
, m
->m_enabled
, bad_ext
);
344 ceph_decode_32_safe(p
, end
, name_len
, bad_ext
);
345 ceph_decode_need(p
, end
, name_len
, bad_ext
);
/*
 * From version 9, a set of n u32 values is read and m_damaged is true
 * when n > 0. The assignment of false below is presumably the branch for
 * older versions - confirm.
 */
349 if (mdsmap_ev
>= 9) {
351 ceph_decode_32_safe(p
, end
, n
, bad_ext
);
352 need
= sizeof(u32
) * n
;
353 ceph_decode_need(p
, end
, need
, bad_ext
);
355 m
->m_damaged
= n
> 0;
357 m
->m_damaged
= false;
360 dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
361 !!m
->m_enabled
, !!m
->m_damaged
, m
->m_num_laggy
);
363 dout("mdsmap_decode success epoch %u\n", m
->m_epoch
);
/*
 * Error path: log, dump end - start bytes of the buffer beginning at the
 * original start pointer, and destroy the partly built map.
 * NOTE(review): end may have been narrowed to the declared map length
 * earlier, so the dump may cover less than the caller's buffer.
 */
369 pr_err("corrupt mdsmap\n");
370 print_hex_dump(KERN_DEBUG
, "mdsmap: ",
371 DUMP_PREFIX_OFFSET
, 16, 1,
372 start
, end
- start
, true);
374 ceph_mdsmap_destroy(m
);
/*
 * Release memory owned by map m. The visible statements free the
 * export_targets array of every m_info slot in [0, m_num_mds) and then
 * m_data_pg_pools.
 *
 * NOTE(review): the lines that would free m_info and m itself are not in
 * this extract - presumably they exist, confirm. The loop dereferences
 * m->m_info with no NULL guard visible; ceph_mdsmap_decode calls this
 * function from its error path, so confirm m_info cannot be NULL while
 * m_num_mds is non-zero (for example after a failed m_info allocation).
 */
381 void ceph_mdsmap_destroy(struct ceph_mdsmap
*m
)
385 for (i
= 0; i
< m
->m_num_mds
; i
++)
386 kfree(m
->m_info
[i
].export_targets
);
388 kfree(m
->m_data_pg_pools
);
/*
 * Report whether map m describes a usable cluster. The visible code ends
 * with return nr_active > 0, where nr_active starts at 0.
 *
 * NOTE(review): several lines are absent from this extract (early returns
 * and the statement that updates nr_active). Comments below cover only
 * the visible statements.
 */
392 bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap
*m
)
394 int i
, nr_active
= 0;
/*
 * Any laggy mds is singled out here. NOTE(review): presumably this makes
 * the function return false; the statement is not visible.
 */
399 if (m
->m_num_laggy
> 0)
/*
 * Scan every slot for state == CEPH_MDS_STATE_ACTIVE. NOTE(review):
 * presumably each match increments nr_active; the statement is not
 * visible.
 */
401 for (i
= 0; i
< m
->m_num_mds
; i
++) {
402 if (m
->m_info
[i
].state
== CEPH_MDS_STATE_ACTIVE
)
405 return nr_active
> 0;