]> git.proxmox.com Git - ceph.git/blob - ceph/src/osdc/Striper.cc
import ceph quincy 17.2.6
[ceph.git] / ceph / src / osdc / Striper.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2012 Inktank
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "Striper.h"
16
17 #include "include/types.h"
18 #include "include/buffer.h"
19 #include "osd/OSDMap.h"
20
21 #include "common/config.h"
22 #include "common/debug.h"
23
24 #define dout_subsys ceph_subsys_striper
25 #undef dout_prefix
26 #define dout_prefix *_dout << "striper "
27
28 using std::make_pair;
29 using std::map;
30 using std::pair;
31
32 using ceph::bufferlist;
33
34 namespace {
35
36 object_t format_oid(const char* object_format, uint64_t object_no) {
37 char buf[strlen(object_format) + 32];
38 snprintf(buf, sizeof(buf), object_format, (long long unsigned)object_no);
39 return object_t(buf);
40 }
41
42 struct OrderByObject {
43 constexpr bool operator()(uint64_t object_no,
44 const striper::LightweightObjectExtent& rhs) const {
45 return object_no < rhs.object_no;
46 }
47 constexpr bool operator()(const striper::LightweightObjectExtent& lhs,
48 uint64_t object_no) const {
49 return lhs.object_no < object_no;
50 }
51 };
52
53 template <typename I>
54 void add_partial_sparse_result(
55 CephContext *cct,
56 std::map<uint64_t, std::pair<ceph::buffer::list, uint64_t> >* partial,
57 uint64_t* total_intended_len, bufferlist& bl, I* it, const I& end_it,
58 uint64_t* bl_off, uint64_t tofs, uint64_t tlen) {
59 ldout(cct, 30) << " be " << tofs << "~" << tlen << dendl;
60
61 auto& s = *it;
62 while (tlen > 0) {
63 ldout(cct, 20) << " t " << tofs << "~" << tlen
64 << " bl has " << bl.length()
65 << " off " << *bl_off << dendl;
66 if (s == end_it) {
67 ldout(cct, 20) << " s at end" << dendl;
68 auto& r = (*partial)[tofs];
69 r.second = tlen;
70 *total_intended_len += r.second;
71 break;
72 }
73
74 ldout(cct, 30) << " s " << s->first << "~" << s->second << dendl;
75
76 // skip zero-length extent
77 if (s->second == 0) {
78 ldout(cct, 30) << " s len 0, skipping" << dendl;
79 ++s;
80 continue;
81 }
82
83 if (s->first > *bl_off) {
84 // gap in sparse read result
85 pair<bufferlist, uint64_t>& r = (*partial)[tofs];
86 size_t gap = std::min<size_t>(s->first - *bl_off, tlen);
87 ldout(cct, 20) << " s gap " << gap << ", skipping" << dendl;
88 r.second = gap;
89 *total_intended_len += r.second;
90 *bl_off += gap;
91 tofs += gap;
92 tlen -= gap;
93 if (tlen == 0) {
94 continue;
95 }
96 }
97
98 ceph_assert(s->first <= *bl_off);
99 size_t left = (s->first + s->second) - *bl_off;
100 size_t actual = std::min<size_t>(left, tlen);
101
102 if (actual > 0) {
103 ldout(cct, 20) << " s has " << actual << ", copying" << dendl;
104 pair<bufferlist, uint64_t>& r = (*partial)[tofs];
105 bl.splice(0, actual, &r.first);
106 r.second = actual;
107 *total_intended_len += r.second;
108 *bl_off += actual;
109 tofs += actual;
110 tlen -= actual;
111 }
112 if (actual == left) {
113 ldout(cct, 30) << " s advancing" << dendl;
114 ++s;
115 }
116 }
117 }
118
119 } // anonymous namespace
120
121 void Striper::file_to_extents(CephContext *cct, const char *object_format,
122 const file_layout_t *layout,
123 uint64_t offset, uint64_t len,
124 uint64_t trunc_size,
125 std::vector<ObjectExtent>& extents,
126 uint64_t buffer_offset)
127 {
128 striper::LightweightObjectExtents lightweight_object_extents;
129 file_to_extents(cct, layout, offset, len, trunc_size, buffer_offset,
130 &lightweight_object_extents);
131
132 // convert lightweight object extents to heavyweight version
133 extents.reserve(lightweight_object_extents.size());
134 for (auto& lightweight_object_extent : lightweight_object_extents) {
135 auto& object_extent = extents.emplace_back(
136 object_t(format_oid(object_format, lightweight_object_extent.object_no)),
137 lightweight_object_extent.object_no,
138 lightweight_object_extent.offset, lightweight_object_extent.length,
139 lightweight_object_extent.truncate_size);
140
141 object_extent.oloc = OSDMap::file_to_object_locator(*layout);
142 object_extent.buffer_extents.reserve(
143 lightweight_object_extent.buffer_extents.size());
144 object_extent.buffer_extents.insert(
145 object_extent.buffer_extents.end(),
146 lightweight_object_extent.buffer_extents.begin(),
147 lightweight_object_extent.buffer_extents.end());
148 }
149 }
150
151 void Striper::file_to_extents(
152 CephContext *cct, const char *object_format,
153 const file_layout_t *layout,
154 uint64_t offset, uint64_t len,
155 uint64_t trunc_size,
156 map<object_t,std::vector<ObjectExtent> >& object_extents,
157 uint64_t buffer_offset)
158 {
159 striper::LightweightObjectExtents lightweight_object_extents;
160 file_to_extents(cct, layout, offset, len, trunc_size, buffer_offset,
161 &lightweight_object_extents);
162
163 // convert lightweight object extents to heavyweight version
164 for (auto& lightweight_object_extent : lightweight_object_extents) {
165 auto oid = format_oid(object_format, lightweight_object_extent.object_no);
166 auto& object_extent = object_extents[oid].emplace_back(
167 oid, lightweight_object_extent.object_no,
168 lightweight_object_extent.offset, lightweight_object_extent.length,
169 lightweight_object_extent.truncate_size);
170
171 object_extent.oloc = OSDMap::file_to_object_locator(*layout);
172 object_extent.buffer_extents.reserve(
173 lightweight_object_extent.buffer_extents.size());
174 object_extent.buffer_extents.insert(
175 object_extent.buffer_extents.end(),
176 lightweight_object_extent.buffer_extents.begin(),
177 lightweight_object_extent.buffer_extents.end());
178 }
179 }
180
181 void Striper::file_to_extents(
182 CephContext *cct, const file_layout_t *layout, uint64_t offset,
183 uint64_t len, uint64_t trunc_size, uint64_t buffer_offset,
184 striper::LightweightObjectExtents* object_extents) {
185 ldout(cct, 10) << "file_to_extents " << offset << "~" << len << dendl;
186 ceph_assert(len > 0);
187
188 /*
189 * we want only one extent per object! this means that each extent
190 * we read may map into different bits of the final read
191 * buffer.. hence buffer_extents
192 */
193
194 __u32 object_size = layout->object_size;
195 __u32 su = layout->stripe_unit;
196 __u32 stripe_count = layout->stripe_count;
197 ceph_assert(object_size >= su);
198 if (stripe_count == 1) {
199 ldout(cct, 20) << " sc is one, reset su to os" << dendl;
200 su = object_size;
201 }
202 uint64_t stripes_per_object = object_size / su;
203 ldout(cct, 20) << " su " << su << " sc " << stripe_count << " os "
204 << object_size << " stripes_per_object " << stripes_per_object
205 << dendl;
206
207 uint64_t cur = offset;
208 uint64_t left = len;
209 while (left > 0) {
210 // layout into objects
211 uint64_t blockno = cur / su; // which block
212 // which horizontal stripe (Y)
213 uint64_t stripeno = blockno / stripe_count;
214 // which object in the object set (X)
215 uint64_t stripepos = blockno % stripe_count;
216 // which object set
217 uint64_t objectsetno = stripeno / stripes_per_object;
218 // object id
219 uint64_t objectno = objectsetno * stripe_count + stripepos;
220
221 // map range into object
222 uint64_t block_start = (stripeno % stripes_per_object) * su;
223 uint64_t block_off = cur % su;
224 uint64_t max = su - block_off;
225
226 uint64_t x_offset = block_start + block_off;
227 uint64_t x_len;
228 if (left > max)
229 x_len = max;
230 else
231 x_len = left;
232
233 ldout(cct, 20) << " off " << cur << " blockno " << blockno << " stripeno "
234 << stripeno << " stripepos " << stripepos << " objectsetno "
235 << objectsetno << " objectno " << objectno
236 << " block_start " << block_start << " block_off "
237 << block_off << " " << x_offset << "~" << x_len
238 << dendl;
239
240 striper::LightweightObjectExtent* ex = nullptr;
241 auto it = std::upper_bound(object_extents->begin(), object_extents->end(),
242 objectno, OrderByObject());
243 striper::LightweightObjectExtents::reverse_iterator rev_it(it);
244 if (rev_it == object_extents->rend() ||
245 rev_it->object_no != objectno ||
246 rev_it->offset + rev_it->length != x_offset) {
247 // expect up to "stripe-width - 1" vector shifts in the worst-case
248 ex = &(*object_extents->emplace(
249 it, objectno, x_offset, x_len,
250 object_truncate_size(cct, layout, objectno, trunc_size)));
251 ldout(cct, 20) << " added new " << *ex << dendl;
252 } else {
253 ex = &(*rev_it);
254 ceph_assert(ex->offset + ex->length == x_offset);
255
256 ldout(cct, 20) << " adding in to " << *ex << dendl;
257 ex->length += x_len;
258 }
259
260 ex->buffer_extents.emplace_back(cur - offset + buffer_offset, x_len);
261
262 ldout(cct, 15) << "file_to_extents " << *ex << dendl;
263 // ldout(cct, 0) << "map: ino " << ino << " oid " << ex.oid << " osd "
264 // << ex.osd << " offset " << ex.offset << " len " << ex.len
265 // << " ... left " << left << dendl;
266
267 left -= x_len;
268 cur += x_len;
269 }
270 }
271
272 void Striper::extent_to_file(CephContext *cct, file_layout_t *layout,
273 uint64_t objectno, uint64_t off, uint64_t len,
274 std::vector<pair<uint64_t, uint64_t> >& extents)
275 {
276 ldout(cct, 10) << "extent_to_file " << objectno << " " << off << "~"
277 << len << dendl;
278
279 __u32 object_size = layout->object_size;
280 __u32 su = layout->stripe_unit;
281 __u32 stripe_count = layout->stripe_count;
282 ceph_assert(object_size >= su);
283 uint64_t stripes_per_object = object_size / su;
284 ldout(cct, 20) << " stripes_per_object " << stripes_per_object << dendl;
285
286 uint64_t off_in_block = off % su;
287
288 extents.reserve(len / su + 1);
289
290 while (len > 0) {
291 uint64_t stripepos = objectno % stripe_count;
292 uint64_t objectsetno = objectno / stripe_count;
293 uint64_t stripeno = off / su + objectsetno * stripes_per_object;
294 uint64_t blockno = stripeno * stripe_count + stripepos;
295 uint64_t extent_off = blockno * su + off_in_block;
296 uint64_t extent_len = std::min(len, su - off_in_block);
297 extents.push_back(make_pair(extent_off, extent_len));
298
299 ldout(cct, 20) << " object " << off << "~" << extent_len
300 << " -> file " << extent_off << "~" << extent_len
301 << dendl;
302
303 off_in_block = 0;
304 off += extent_len;
305 len -= extent_len;
306 }
307 }
308
309 uint64_t Striper::object_truncate_size(CephContext *cct,
310 const file_layout_t *layout,
311 uint64_t objectno, uint64_t trunc_size)
312 {
313 uint64_t obj_trunc_size;
314 if (trunc_size == 0 || trunc_size == (uint64_t)-1) {
315 obj_trunc_size = trunc_size;
316 } else {
317 __u32 object_size = layout->object_size;
318 __u32 su = layout->stripe_unit;
319 __u32 stripe_count = layout->stripe_count;
320 ceph_assert(object_size >= su);
321 uint64_t stripes_per_object = object_size / su;
322
323 uint64_t objectsetno = objectno / stripe_count;
324 uint64_t trunc_objectsetno = trunc_size / object_size / stripe_count;
325 if (objectsetno > trunc_objectsetno)
326 obj_trunc_size = 0;
327 else if (objectsetno < trunc_objectsetno)
328 obj_trunc_size = object_size;
329 else {
330 uint64_t trunc_blockno = trunc_size / su;
331 uint64_t trunc_stripeno = trunc_blockno / stripe_count;
332 uint64_t trunc_stripepos = trunc_blockno % stripe_count;
333 uint64_t trunc_objectno = trunc_objectsetno * stripe_count
334 + trunc_stripepos;
335 if (objectno < trunc_objectno)
336 obj_trunc_size = ((trunc_stripeno % stripes_per_object) + 1) * su;
337 else if (objectno > trunc_objectno)
338 obj_trunc_size = (trunc_stripeno % stripes_per_object) * su;
339 else
340 obj_trunc_size = (trunc_stripeno % stripes_per_object) * su
341 + (trunc_size % su);
342 }
343 }
344 ldout(cct, 20) << "object_truncate_size " << objectno << " "
345 << trunc_size << "->" << obj_trunc_size << dendl;
346 return obj_trunc_size;
347 }
348
349 uint64_t Striper::get_num_objects(const file_layout_t& layout,
350 uint64_t size)
351 {
352 __u32 stripe_unit = layout.stripe_unit;
353 __u32 stripe_count = layout.stripe_count;
354 uint64_t period = layout.get_period();
355 uint64_t num_periods = (size + period - 1) / period;
356 uint64_t remainder_bytes = size % period;
357 uint64_t remainder_objs = 0;
358 if ((remainder_bytes > 0) && (remainder_bytes < (uint64_t)stripe_count
359 * stripe_unit))
360 remainder_objs = stripe_count - ((remainder_bytes + stripe_unit - 1)
361 / stripe_unit);
362 return num_periods * stripe_count - remainder_objs;
363 }
364
365 uint64_t Striper::get_file_offset(CephContext *cct,
366 const file_layout_t *layout, uint64_t objectno, uint64_t off) {
367 ldout(cct, 10) << "get_file_offset " << objectno << " " << off << dendl;
368
369 __u32 object_size = layout->object_size;
370 __u32 su = layout->stripe_unit;
371 __u32 stripe_count = layout->stripe_count;
372 ceph_assert(object_size >= su);
373 uint64_t stripes_per_object = object_size / su;
374 ldout(cct, 20) << " stripes_per_object " << stripes_per_object << dendl;
375
376 uint64_t off_in_block = off % su;
377
378 uint64_t stripepos = objectno % stripe_count;
379 uint64_t objectsetno = objectno / stripe_count;
380 uint64_t stripeno = off / su + objectsetno * stripes_per_object;
381 uint64_t blockno = stripeno * stripe_count + stripepos;
382 return blockno * su + off_in_block;
383 }
384
385 // StripedReadResult
386
387 void Striper::StripedReadResult::add_partial_result(
388 CephContext *cct, bufferlist& bl,
389 const std::vector<pair<uint64_t,uint64_t> >& buffer_extents)
390 {
391 ldout(cct, 10) << "add_partial_result(" << this << ") " << bl.length()
392 << " to " << buffer_extents << dendl;
393 for (auto p = buffer_extents.cbegin(); p != buffer_extents.cend(); ++p) {
394 pair<bufferlist, uint64_t>& r = partial[p->first];
395 size_t actual = std::min<uint64_t>(bl.length(), p->second);
396 bl.splice(0, actual, &r.first);
397 r.second = p->second;
398 total_intended_len += r.second;
399 }
400 }
401
402 void Striper::StripedReadResult::add_partial_result(
403 CephContext *cct, bufferlist&& bl,
404 const striper::LightweightBufferExtents& buffer_extents)
405 {
406 ldout(cct, 10) << "add_partial_result(" << this << ") " << bl.length()
407 << " to " << buffer_extents << dendl;
408 for (auto& be : buffer_extents) {
409 auto& r = partial[be.first];
410 size_t actual = std::min<uint64_t>(bl.length(), be.second);
411 if (buffer_extents.size() == 1) {
412 r.first = std::move(bl);
413 } else {
414 bl.splice(0, actual, &r.first);
415 }
416 r.second = be.second;
417 total_intended_len += r.second;
418 }
419 }
420
421 void Striper::StripedReadResult::add_partial_sparse_result(
422 CephContext *cct, bufferlist& bl, const map<uint64_t, uint64_t>& bl_map,
423 uint64_t bl_off, const std::vector<pair<uint64_t,uint64_t> >& buffer_extents)
424 {
425 ldout(cct, 10) << "add_partial_sparse_result(" << this << ") " << bl.length()
426 << " covering " << bl_map << " (offset " << bl_off << ")"
427 << " to " << buffer_extents << dendl;
428
429 if (bl_map.empty()) {
430 add_partial_result(cct, bl, buffer_extents);
431 return;
432 }
433
434 auto s = bl_map.cbegin();
435 for (auto& be : buffer_extents) {
436 ::add_partial_sparse_result(cct, &partial, &total_intended_len, bl, &s,
437 bl_map.end(), &bl_off, be.first, be.second);
438 }
439 }
440
441 void Striper::StripedReadResult::add_partial_sparse_result(
442 CephContext *cct, ceph::buffer::list&& bl,
443 const std::vector<std::pair<uint64_t, uint64_t>>& bl_map, uint64_t bl_off,
444 const striper::LightweightBufferExtents& buffer_extents) {
445 ldout(cct, 10) << "add_partial_sparse_result(" << this << ") " << bl.length()
446 << " covering " << bl_map << " (offset " << bl_off << ")"
447 << " to " << buffer_extents << dendl;
448
449 if (bl_map.empty()) {
450 add_partial_result(cct, std::move(bl), buffer_extents);
451 return;
452 }
453
454 auto s = bl_map.cbegin();
455 for (auto& be : buffer_extents) {
456 ::add_partial_sparse_result(cct, &partial, &total_intended_len, bl, &s,
457 bl_map.cend(), &bl_off, be.first, be.second);
458 }
459 }
460
461 void Striper::StripedReadResult::assemble_result(CephContext *cct,
462 bufferlist& bl,
463 bool zero_tail)
464 {
465 ldout(cct, 10) << "assemble_result(" << this << ") zero_tail=" << zero_tail
466 << dendl;
467 size_t zeros = 0; // zeros preceding current position
468 for (auto& p : partial) {
469 size_t got = p.second.first.length();
470 size_t expect = p.second.second;
471 if (got) {
472 if (zeros) {
473 bl.append_zero(zeros);
474 zeros = 0;
475 }
476 bl.claim_append(p.second.first);
477 }
478 zeros += expect - got;
479 }
480 if (zero_tail && zeros) {
481 bl.append_zero(zeros);
482 }
483 partial.clear();
484 }
485
486 void Striper::StripedReadResult::assemble_result(CephContext *cct, char *buffer, size_t length)
487 {
488
489 ceph_assert(buffer && length == total_intended_len);
490
491 map<uint64_t,pair<bufferlist,uint64_t> >::reverse_iterator p = partial.rbegin();
492 if (p == partial.rend())
493 return;
494
495 uint64_t curr = length;
496 uint64_t end = p->first + p->second.second;
497 while (p != partial.rend()) {
498 // sanity check
499 ldout(cct, 20) << "assemble_result(" << this << ") " << p->first << "~" << p->second.second
500 << " " << p->second.first.length() << " bytes"
501 << dendl;
502 ceph_assert(p->first == end - p->second.second);
503 end = p->first;
504
505 size_t len = p->second.first.length();
506 ceph_assert(curr >= p->second.second);
507 curr -= p->second.second;
508 if (len < p->second.second) {
509 if (len)
510 p->second.first.begin().copy(len, buffer + curr);
511 // FIPS zeroization audit 20191117: this memset is not security related.
512 memset(buffer + curr + len, 0, p->second.second - len);
513 } else {
514 p->second.first.begin().copy(len, buffer + curr);
515 }
516 ++p;
517 }
518 partial.clear();
519 ceph_assert(curr == 0);
520 }
521
522 uint64_t Striper::StripedReadResult::assemble_result(
523 CephContext *cct, std::map<uint64_t, uint64_t> *extent_map,
524 bufferlist *bl)
525 {
526 ldout(cct, 10) << "assemble_result(" << this << ")" << dendl;
527 for (auto& p : partial) {
528 uint64_t off = p.first;
529 uint64_t len = p.second.first.length();
530 if (len > 0) {
531 (*extent_map)[off] = len;
532 bl->claim_append(p.second.first);
533 }
534 }
535 partial.clear();
536 return total_intended_len;
537 }