1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2012 Inktank
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
17 #include "include/types.h"
18 #include "include/buffer.h"
19 #include "osd/OSDMap.h"
21 #include "common/config.h"
22 #include "common/debug.h"
24 #define dout_subsys ceph_subsys_striper
26 #define dout_prefix *_dout << "striper "
32 using ceph::bufferlist
;
36 object_t
format_oid(const char* object_format
, uint64_t object_no
) {
37 char buf
[strlen(object_format
) + 32];
38 snprintf(buf
, sizeof(buf
), object_format
, (long long unsigned)object_no
);
42 struct OrderByObject
{
43 constexpr bool operator()(uint64_t object_no
,
44 const striper::LightweightObjectExtent
& rhs
) const {
45 return object_no
< rhs
.object_no
;
47 constexpr bool operator()(const striper::LightweightObjectExtent
& lhs
,
48 uint64_t object_no
) const {
49 return lhs
.object_no
< object_no
;
// Fold one target buffer extent (tofs~tlen) of a sparse read into *partial.
//
// *partial maps a target offset to (data, intended length); every entry this
// helper touches has its intended length added to *total_intended_len.
// The visible code reads s->first / s->second as the offset and length of the
// current sparse extent and *bl_off as the current position in that same
// offset space. Three situations are handled in the visible statements:
//   - " s at end": no sparse extents remain; the entry keyed by tofs is
//     looked up (created if absent) and counted towards the total;
//   - s->first > *bl_off: there is a hole before the next sparse extent; the
//     hole size is capped at tlen and the entry keyed by tofs carries no data;
//   - otherwise (asserted s->first <= *bl_off): up to
//     min((s->first + s->second) - *bl_off, tlen) bytes are spliced from the
//     front of bl into the entry keyed by tofs, and when that consumes the
//     whole remainder of the sparse extent the " s advancing" path is taken.
// NOTE(review): this listing omits several original lines (the declaration of
// `s`, the enclosing loop, the assignments to r.second and the statements that
// advance tofs/tlen/*bl_off/s), so control flow is only partly visible here --
// confirm against the complete file before relying on the summary above.
54 void add_partial_sparse_result(
56 std::map
<uint64_t, std::pair
<ceph::buffer::list
, uint64_t> >* partial
,
57 uint64_t* total_intended_len
, bufferlist
& bl
, I
* it
, const I
& end_it
,
58 uint64_t* bl_off
, uint64_t tofs
, uint64_t tlen
) {
59 ldout(cct
, 30) << " be " << tofs
<< "~" << tlen
<< dendl
;
63 ldout(cct
, 20) << " t " << tofs
<< "~" << tlen
64 << " bl has " << bl
.length()
65 << " off " << *bl_off
<< dendl
;
67 ldout(cct
, 20) << " s at end" << dendl
;
68 auto& r
= (*partial
)[tofs
];
70 *total_intended_len
+= r
.second
;
74 ldout(cct
, 30) << " s " << s
->first
<< "~" << s
->second
<< dendl
;
76 // skip zero-length extent
78 ldout(cct
, 30) << " s len 0, skipping" << dendl
;
83 if (s
->first
> *bl_off
) {
84 // gap in sparse read result
85 pair
<bufferlist
, uint64_t>& r
= (*partial
)[tofs
];
86 size_t gap
= std::min
<size_t>(s
->first
- *bl_off
, tlen
);
87 ldout(cct
, 20) << " s gap " << gap
<< ", skipping" << dendl
;
89 *total_intended_len
+= r
.second
;
98 ceph_assert(s
->first
<= *bl_off
);
99 size_t left
= (s
->first
+ s
->second
) - *bl_off
;
100 size_t actual
= std::min
<size_t>(left
, tlen
);
103 ldout(cct
, 20) << " s has " << actual
<< ", copying" << dendl
;
104 pair
<bufferlist
, uint64_t>& r
= (*partial
)[tofs
];
105 bl
.splice(0, actual
, &r
.first
);
107 *total_intended_len
+= r
.second
;
112 if (actual
== left
) {
113 ldout(cct
, 30) << " s advancing" << dendl
;
119 } // anonymous namespace
121 void Striper::file_to_extents(CephContext
*cct
, const char *object_format
,
122 const file_layout_t
*layout
,
123 uint64_t offset
, uint64_t len
,
125 std::vector
<ObjectExtent
>& extents
,
126 uint64_t buffer_offset
)
128 striper::LightweightObjectExtents lightweight_object_extents
;
129 file_to_extents(cct
, layout
, offset
, len
, trunc_size
, buffer_offset
,
130 &lightweight_object_extents
);
132 // convert lightweight object extents to heavyweight version
133 extents
.reserve(lightweight_object_extents
.size());
134 for (auto& lightweight_object_extent
: lightweight_object_extents
) {
135 auto& object_extent
= extents
.emplace_back(
136 object_t(format_oid(object_format
, lightweight_object_extent
.object_no
)),
137 lightweight_object_extent
.object_no
,
138 lightweight_object_extent
.offset
, lightweight_object_extent
.length
,
139 lightweight_object_extent
.truncate_size
);
141 object_extent
.oloc
= OSDMap::file_to_object_locator(*layout
);
142 object_extent
.buffer_extents
.reserve(
143 lightweight_object_extent
.buffer_extents
.size());
144 object_extent
.buffer_extents
.insert(
145 object_extent
.buffer_extents
.end(),
146 lightweight_object_extent
.buffer_extents
.begin(),
147 lightweight_object_extent
.buffer_extents
.end());
151 void Striper::file_to_extents(
152 CephContext
*cct
, const char *object_format
,
153 const file_layout_t
*layout
,
154 uint64_t offset
, uint64_t len
,
156 map
<object_t
,std::vector
<ObjectExtent
> >& object_extents
,
157 uint64_t buffer_offset
)
159 striper::LightweightObjectExtents lightweight_object_extents
;
160 file_to_extents(cct
, layout
, offset
, len
, trunc_size
, buffer_offset
,
161 &lightweight_object_extents
);
163 // convert lightweight object extents to heavyweight version
164 for (auto& lightweight_object_extent
: lightweight_object_extents
) {
165 auto oid
= format_oid(object_format
, lightweight_object_extent
.object_no
);
166 auto& object_extent
= object_extents
[oid
].emplace_back(
167 oid
, lightweight_object_extent
.object_no
,
168 lightweight_object_extent
.offset
, lightweight_object_extent
.length
,
169 lightweight_object_extent
.truncate_size
);
171 object_extent
.oloc
= OSDMap::file_to_object_locator(*layout
);
172 object_extent
.buffer_extents
.reserve(
173 lightweight_object_extent
.buffer_extents
.size());
174 object_extent
.buffer_extents
.insert(
175 object_extent
.buffer_extents
.end(),
176 lightweight_object_extent
.buffer_extents
.begin(),
177 lightweight_object_extent
.buffer_extents
.end());
// Core striping routine: map the file byte range offset~len onto object
// extents under `layout`, accumulating them in *object_extents. len must be
// non-zero (asserted).
//
// The layout's stripe unit must not exceed its object size (asserted); the
// " sc is one, reset su to os" branch is taken when stripe_count is 1.
// For a file position `cur` the visible arithmetic derives:
//   blockno     = cur / su
//   stripeno    = blockno / stripe_count
//   stripepos   = blockno % stripe_count
//   objectsetno = stripeno / stripes_per_object
//   objectno    = objectsetno * stripe_count + stripepos
//   x_offset    = (stripeno % stripes_per_object) * su + cur % su
// The insertion point for objectno is found with std::upper_bound using
// OrderByObject. A new extent is emplaced there (with its truncate size from
// object_truncate_size) unless the element just before that point belongs to
// the same object and ends exactly at x_offset; in that case the " adding in
// to " branch runs instead. Either way a buffer extent
// (cur - offset + buffer_offset, x_len) is appended to the chosen extent.
// NOTE(review): this listing omits some original lines (the enclosing loop,
// how x_len is derived from `max`, the assignment inside the stripe_count == 1
// branch and the updates in the " adding in to " branch) -- confirm those
// against the complete file.
181 void Striper::file_to_extents(
182 CephContext
*cct
, const file_layout_t
*layout
, uint64_t offset
,
183 uint64_t len
, uint64_t trunc_size
, uint64_t buffer_offset
,
184 striper::LightweightObjectExtents
* object_extents
) {
185 ldout(cct
, 10) << "file_to_extents " << offset
<< "~" << len
<< dendl
;
186 ceph_assert(len
> 0);
189 * we want only one extent per object! this means that each extent
190 * we read may map into different bits of the final read
191 * buffer.. hence buffer_extents
194 __u32 object_size
= layout
->object_size
;
195 __u32 su
= layout
->stripe_unit
;
196 __u32 stripe_count
= layout
->stripe_count
;
197 ceph_assert(object_size
>= su
);
198 if (stripe_count
== 1) {
199 ldout(cct
, 20) << " sc is one, reset su to os" << dendl
;
202 uint64_t stripes_per_object
= object_size
/ su
;
203 ldout(cct
, 20) << " su " << su
<< " sc " << stripe_count
<< " os "
204 << object_size
<< " stripes_per_object " << stripes_per_object
207 uint64_t cur
= offset
;
210 // layout into objects
211 uint64_t blockno
= cur
/ su
; // which block
212 // which horizontal stripe (Y)
213 uint64_t stripeno
= blockno
/ stripe_count
;
214 // which object in the object set (X)
215 uint64_t stripepos
= blockno
% stripe_count
;
217 uint64_t objectsetno
= stripeno
/ stripes_per_object
;
219 uint64_t objectno
= objectsetno
* stripe_count
+ stripepos
;
221 // map range into object
222 uint64_t block_start
= (stripeno
% stripes_per_object
) * su
;
223 uint64_t block_off
= cur
% su
;
224 uint64_t max
= su
- block_off
;
226 uint64_t x_offset
= block_start
+ block_off
;
233 ldout(cct
, 20) << " off " << cur
<< " blockno " << blockno
<< " stripeno "
234 << stripeno
<< " stripepos " << stripepos
<< " objectsetno "
235 << objectsetno
<< " objectno " << objectno
236 << " block_start " << block_start
<< " block_off "
237 << block_off
<< " " << x_offset
<< "~" << x_len
240 striper::LightweightObjectExtent
* ex
= nullptr;
241 auto it
= std::upper_bound(object_extents
->begin(), object_extents
->end(),
242 objectno
, OrderByObject());
243 striper::LightweightObjectExtents::reverse_iterator
rev_it(it
);
244 if (rev_it
== object_extents
->rend() ||
245 rev_it
->object_no
!= objectno
||
246 rev_it
->offset
+ rev_it
->length
!= x_offset
) {
247 // expect up to "stripe-width - 1" vector shifts in the worst-case
248 ex
= &(*object_extents
->emplace(
249 it
, objectno
, x_offset
, x_len
,
250 object_truncate_size(cct
, layout
, objectno
, trunc_size
)));
251 ldout(cct
, 20) << " added new " << *ex
<< dendl
;
254 ceph_assert(ex
->offset
+ ex
->length
== x_offset
);
256 ldout(cct
, 20) << " adding in to " << *ex
<< dendl
;
260 ex
->buffer_extents
.emplace_back(cur
- offset
+ buffer_offset
, x_len
);
262 ldout(cct
, 15) << "file_to_extents " << *ex
<< dendl
;
263 // ldout(cct, 0) << "map: ino " << ino << " oid " << ex.oid << " osd "
264 // << ex.osd << " offset " << ex.offset << " len " << ex.len
265 // << " ... left " << left << dendl;
// Translate the object-relative range off~len of object `objectno` back into
// file extents, pushed onto `extents` as (file offset, length) pairs.
//
// The layout's stripe unit must not exceed its object size (asserted).
// `extents` is pre-reserved for len / su + 1 entries. The visible arithmetic
// for one piece is:
//   stripepos   = objectno % stripe_count
//   objectsetno = objectno / stripe_count
//   stripeno    = off / su + objectsetno * stripes_per_object
//   blockno     = stripeno * stripe_count + stripepos
//   extent_off  = blockno * su + off_in_block
//   extent_len  = min(len, su - off_in_block)
// NOTE(review): this listing omits some original lines (the loop construct
// and the per-piece updates of off/len/off_in_block) -- confirm how successive
// pieces are produced against the complete file.
272 void Striper::extent_to_file(CephContext
*cct
, file_layout_t
*layout
,
273 uint64_t objectno
, uint64_t off
, uint64_t len
,
274 std::vector
<pair
<uint64_t, uint64_t> >& extents
)
276 ldout(cct
, 10) << "extent_to_file " << objectno
<< " " << off
<< "~"
279 __u32 object_size
= layout
->object_size
;
280 __u32 su
= layout
->stripe_unit
;
281 __u32 stripe_count
= layout
->stripe_count
;
282 ceph_assert(object_size
>= su
);
283 uint64_t stripes_per_object
= object_size
/ su
;
284 ldout(cct
, 20) << " stripes_per_object " << stripes_per_object
<< dendl
;
286 uint64_t off_in_block
= off
% su
;
288 extents
.reserve(len
/ su
+ 1);
291 uint64_t stripepos
= objectno
% stripe_count
;
292 uint64_t objectsetno
= objectno
/ stripe_count
;
293 uint64_t stripeno
= off
/ su
+ objectsetno
* stripes_per_object
;
294 uint64_t blockno
= stripeno
* stripe_count
+ stripepos
;
295 uint64_t extent_off
= blockno
* su
+ off_in_block
;
296 uint64_t extent_len
= std::min(len
, su
- off_in_block
);
297 extents
.push_back(make_pair(extent_off
, extent_len
));
299 ldout(cct
, 20) << " object " << off
<< "~" << extent_len
300 << " -> file " << extent_off
<< "~" << extent_len
// Derive the truncation size to use for object `objectno` from the file-level
// truncation size `trunc_size`, and return it (the result is also logged).
//
// A trunc_size of 0 or (uint64_t)-1 is passed through unchanged. Otherwise
// the object set holding the truncation point
// (trunc_size / object_size / stripe_count) is compared with this object's
// set (objectno / stripe_count):
//   - an object in an earlier set gets object_size;
//   - within the same set the result is a multiple of su based on
//     trunc_stripeno % stripes_per_object: one extra stripe unit when
//     objectno < trunc_objectno, none when objectno > trunc_objectno.
// NOTE(review): this listing omits some original lines (the value assigned
// when objectsetno > trunc_objectsetno, and the tails of the trunc_objectno
// expression and of the final same-object expression) -- confirm those cases
// against the complete file.
309 uint64_t Striper::object_truncate_size(CephContext
*cct
,
310 const file_layout_t
*layout
,
311 uint64_t objectno
, uint64_t trunc_size
)
313 uint64_t obj_trunc_size
;
314 if (trunc_size
== 0 || trunc_size
== (uint64_t)-1) {
315 obj_trunc_size
= trunc_size
;
317 __u32 object_size
= layout
->object_size
;
318 __u32 su
= layout
->stripe_unit
;
319 __u32 stripe_count
= layout
->stripe_count
;
320 ceph_assert(object_size
>= su
);
321 uint64_t stripes_per_object
= object_size
/ su
;
323 uint64_t objectsetno
= objectno
/ stripe_count
;
324 uint64_t trunc_objectsetno
= trunc_size
/ object_size
/ stripe_count
;
325 if (objectsetno
> trunc_objectsetno
)
327 else if (objectsetno
< trunc_objectsetno
)
328 obj_trunc_size
= object_size
;
330 uint64_t trunc_blockno
= trunc_size
/ su
;
331 uint64_t trunc_stripeno
= trunc_blockno
/ stripe_count
;
332 uint64_t trunc_stripepos
= trunc_blockno
% stripe_count
;
333 uint64_t trunc_objectno
= trunc_objectsetno
* stripe_count
335 if (objectno
< trunc_objectno
)
336 obj_trunc_size
= ((trunc_stripeno
% stripes_per_object
) + 1) * su
;
337 else if (objectno
> trunc_objectno
)
338 obj_trunc_size
= (trunc_stripeno
% stripes_per_object
) * su
;
340 obj_trunc_size
= (trunc_stripeno
% stripes_per_object
) * su
344 ldout(cct
, 20) << "object_truncate_size " << objectno
<< " "
345 << trunc_size
<< "->" << obj_trunc_size
<< dendl
;
346 return obj_trunc_size
;
// Return the number of objects spanned by `size` bytes under `layout`.
//
// The size is rounded up to whole periods (layout.get_period()), each of
// which accounts for stripe_count objects; remainder_objs is then subtracted
// to correct for a final, partially filled period (size % period != 0).
// NOTE(review): this listing omits some original lines (the declaration of
// `size`, presumably the second parameter, and the tails of the `if`
// condition and of the remainder_objs expression) -- confirm the exact
// threshold and divisor against the complete file.
349 uint64_t Striper::get_num_objects(const file_layout_t
& layout
,
352 __u32 stripe_unit
= layout
.stripe_unit
;
353 __u32 stripe_count
= layout
.stripe_count
;
354 uint64_t period
= layout
.get_period();
355 uint64_t num_periods
= (size
+ period
- 1) / period
;
356 uint64_t remainder_bytes
= size
% period
;
357 uint64_t remainder_objs
= 0;
358 if ((remainder_bytes
> 0) && (remainder_bytes
< (uint64_t)stripe_count
360 remainder_objs
= stripe_count
- ((remainder_bytes
+ stripe_unit
- 1)
362 return num_periods
* stripe_count
- remainder_objs
;
365 uint64_t Striper::get_file_offset(CephContext
*cct
,
366 const file_layout_t
*layout
, uint64_t objectno
, uint64_t off
) {
367 ldout(cct
, 10) << "get_file_offset " << objectno
<< " " << off
<< dendl
;
369 __u32 object_size
= layout
->object_size
;
370 __u32 su
= layout
->stripe_unit
;
371 __u32 stripe_count
= layout
->stripe_count
;
372 ceph_assert(object_size
>= su
);
373 uint64_t stripes_per_object
= object_size
/ su
;
374 ldout(cct
, 20) << " stripes_per_object " << stripes_per_object
<< dendl
;
376 uint64_t off_in_block
= off
% su
;
378 uint64_t stripepos
= objectno
% stripe_count
;
379 uint64_t objectsetno
= objectno
/ stripe_count
;
380 uint64_t stripeno
= off
/ su
+ objectsetno
* stripes_per_object
;
381 uint64_t blockno
= stripeno
* stripe_count
+ stripepos
;
382 return blockno
* su
+ off_in_block
;
387 void Striper::StripedReadResult::add_partial_result(
388 CephContext
*cct
, bufferlist
& bl
,
389 const std::vector
<pair
<uint64_t,uint64_t> >& buffer_extents
)
391 ldout(cct
, 10) << "add_partial_result(" << this << ") " << bl
.length()
392 << " to " << buffer_extents
<< dendl
;
393 for (auto p
= buffer_extents
.cbegin(); p
!= buffer_extents
.cend(); ++p
) {
394 pair
<bufferlist
, uint64_t>& r
= partial
[p
->first
];
395 size_t actual
= std::min
<uint64_t>(bl
.length(), p
->second
);
396 bl
.splice(0, actual
, &r
.first
);
397 r
.second
= p
->second
;
398 total_intended_len
+= r
.second
;
402 void Striper::StripedReadResult::add_partial_result(
403 CephContext
*cct
, bufferlist
&& bl
,
404 const striper::LightweightBufferExtents
& buffer_extents
)
406 ldout(cct
, 10) << "add_partial_result(" << this << ") " << bl
.length()
407 << " to " << buffer_extents
<< dendl
;
408 for (auto& be
: buffer_extents
) {
409 auto& r
= partial
[be
.first
];
410 size_t actual
= std::min
<uint64_t>(bl
.length(), be
.second
);
411 if (buffer_extents
.size() == 1) {
412 r
.first
= std::move(bl
);
414 bl
.splice(0, actual
, &r
.first
);
416 r
.second
= be
.second
;
417 total_intended_len
+= r
.second
;
421 void Striper::StripedReadResult::add_partial_sparse_result(
422 CephContext
*cct
, bufferlist
& bl
, const map
<uint64_t, uint64_t>& bl_map
,
423 uint64_t bl_off
, const std::vector
<pair
<uint64_t,uint64_t> >& buffer_extents
)
425 ldout(cct
, 10) << "add_partial_sparse_result(" << this << ") " << bl
.length()
426 << " covering " << bl_map
<< " (offset " << bl_off
<< ")"
427 << " to " << buffer_extents
<< dendl
;
429 if (bl_map
.empty()) {
430 add_partial_result(cct
, bl
, buffer_extents
);
434 auto s
= bl_map
.cbegin();
435 for (auto& be
: buffer_extents
) {
436 ::add_partial_sparse_result(cct
, &partial
, &total_intended_len
, bl
, &s
,
437 bl_map
.end(), &bl_off
, be
.first
, be
.second
);
441 void Striper::StripedReadResult::add_partial_sparse_result(
442 CephContext
*cct
, ceph::buffer::list
&& bl
,
443 const std::vector
<std::pair
<uint64_t, uint64_t>>& bl_map
, uint64_t bl_off
,
444 const striper::LightweightBufferExtents
& buffer_extents
) {
445 ldout(cct
, 10) << "add_partial_sparse_result(" << this << ") " << bl
.length()
446 << " covering " << bl_map
<< " (offset " << bl_off
<< ")"
447 << " to " << buffer_extents
<< dendl
;
449 if (bl_map
.empty()) {
450 add_partial_result(cct
, std::move(bl
), buffer_extents
);
454 auto s
= bl_map
.cbegin();
455 for (auto& be
: buffer_extents
) {
456 ::add_partial_sparse_result(cct
, &partial
, &total_intended_len
, bl
, &s
,
457 bl_map
.cend(), &bl_off
, be
.first
, be
.second
);
// Assemble the collected partial results into `bl`, iterating `partial` in
// its container order.
//
// For each entry `got` is the number of bytes actually held and `expect` its
// intended length; the shortfall (expect - got) accumulates in `zeros`.
// Pending zeros are emitted with bl.append_zero() and entry data is moved in
// with bl.claim_append(). After the loop any remaining zeros are appended
// only when zero_tail is set.
// NOTE(review): this listing omits some original lines (the `bl` and
// `zero_tail` parameter declarations, the conditions guarding the in-loop
// append_zero/claim_append, and whatever follows the final append) -- confirm
// against the complete file.
461 void Striper::StripedReadResult::assemble_result(CephContext
*cct
,
465 ldout(cct
, 10) << "assemble_result(" << this << ") zero_tail=" << zero_tail
467 size_t zeros
= 0; // zeros preceding current position
468 for (auto& p
: partial
) {
469 size_t got
= p
.second
.first
.length();
470 size_t expect
= p
.second
.second
;
473 bl
.append_zero(zeros
);
476 bl
.claim_append(p
.second
.first
);
478 zeros
+= expect
- got
;
480 if (zero_tail
&& zeros
) {
481 bl
.append_zero(zeros
);
// Assemble the collected partial results into the caller-supplied flat
// `buffer` of `length` bytes.
//
// `buffer` must be non-null and `length` must equal total_intended_len
// (asserted). Entries of `partial` are visited through a reverse iterator.
// `curr` starts at `length` and is reduced by each entry's intended length
// (asserted not to underflow), so an entry's bytes are copied to
// buffer + curr. When an entry holds fewer bytes than intended, the rest of
// its range is zero-filled with memset. Each entry is also asserted to start
// at `end` minus its intended length, and `curr` is asserted to be 0 at the
// end.
// NOTE(review): this listing omits some original lines (the statement under
// the empty-container check, the update of `end`, any guard on the short-entry
// copy, the iterator advance and post-loop cleanup) -- confirm against the
// complete file.
486 void Striper::StripedReadResult::assemble_result(CephContext
*cct
, char *buffer
, size_t length
)
489 ceph_assert(buffer
&& length
== total_intended_len
);
491 map
<uint64_t,pair
<bufferlist
,uint64_t> >::reverse_iterator p
= partial
.rbegin();
492 if (p
== partial
.rend())
495 uint64_t curr
= length
;
496 uint64_t end
= p
->first
+ p
->second
.second
;
497 while (p
!= partial
.rend()) {
499 ldout(cct
, 20) << "assemble_result(" << this << ") " << p
->first
<< "~" << p
->second
.second
500 << " " << p
->second
.first
.length() << " bytes"
502 ceph_assert(p
->first
== end
- p
->second
.second
);
505 size_t len
= p
->second
.first
.length();
506 ceph_assert(curr
>= p
->second
.second
);
507 curr
-= p
->second
.second
;
508 if (len
< p
->second
.second
) {
510 p
->second
.first
.begin().copy(len
, buffer
+ curr
);
511 // FIPS zeroization audit 20191117: this memset is not security related.
512 memset(buffer
+ curr
+ len
, 0, p
->second
.second
- len
);
514 p
->second
.first
.begin().copy(len
, buffer
+ curr
);
519 ceph_assert(curr
== 0);
// Assemble the collected partial results in sparse form.
//
// For each entry of `partial`, the entry's offset is mapped to the number of
// bytes it actually holds in *extent_map, and its data is moved onto the end
// of *bl with claim_append(). Returns total_intended_len.
// NOTE(review): this listing omits some original lines (the declaration of
// `bl`, presumably the last parameter, any condition guarding the per-entry
// statements, and post-loop cleanup) -- confirm against the complete file.
522 uint64_t Striper::StripedReadResult::assemble_result(
523 CephContext
*cct
, std::map
<uint64_t, uint64_t> *extent_map
,
526 ldout(cct
, 10) << "assemble_result(" << this << ")" << dendl
;
527 for (auto& p
: partial
) {
528 uint64_t off
= p
.first
;
529 uint64_t len
= p
.second
.first
.length();
531 (*extent_map
)[off
] = len
;
532 bl
->claim_append(p
.second
.first
);
536 return total_intended_len
;