]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2012 Inktank | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "Striper.h" | |
16 | ||
17 | #include "include/types.h" | |
18 | #include "include/buffer.h" | |
19 | #include "osd/OSDMap.h" | |
20 | ||
21 | #include "common/config.h" | |
22 | #include "common/debug.h" | |
23 | ||
24 | #define dout_subsys ceph_subsys_striper | |
25 | #undef dout_prefix | |
26 | #define dout_prefix *_dout << "striper " | |
27 | ||
28 | ||
29 | void Striper::file_to_extents(CephContext *cct, const char *object_format, | |
30 | const file_layout_t *layout, | |
31 | uint64_t offset, uint64_t len, | |
32 | uint64_t trunc_size, | |
33 | vector<ObjectExtent>& extents, | |
34 | uint64_t buffer_offset) | |
35 | { | |
36 | map<object_t,vector<ObjectExtent> > object_extents; | |
37 | file_to_extents(cct, object_format, layout, offset, len, trunc_size, | |
38 | object_extents, buffer_offset); | |
39 | assimilate_extents(object_extents, extents); | |
40 | } | |
41 | ||
42 | void Striper::file_to_extents( | |
43 | CephContext *cct, const char *object_format, | |
44 | const file_layout_t *layout, | |
45 | uint64_t offset, uint64_t len, | |
46 | uint64_t trunc_size, | |
47 | map<object_t,vector<ObjectExtent> >& object_extents, | |
48 | uint64_t buffer_offset) | |
49 | { | |
50 | ldout(cct, 10) << "file_to_extents " << offset << "~" << len | |
51 | << " format " << object_format | |
52 | << dendl; | |
11fdf7f2 | 53 | ceph_assert(len > 0); |
7c673cae FG |
54 | |
55 | /* | |
56 | * we want only one extent per object! this means that each extent | |
57 | * we read may map into different bits of the final read | |
58 | * buffer.. hence ObjectExtent.buffer_extents | |
59 | */ | |
60 | ||
61 | __u32 object_size = layout->object_size; | |
62 | __u32 su = layout->stripe_unit; | |
63 | __u32 stripe_count = layout->stripe_count; | |
11fdf7f2 | 64 | ceph_assert(object_size >= su); |
7c673cae FG |
65 | if (stripe_count == 1) { |
66 | ldout(cct, 20) << " sc is one, reset su to os" << dendl; | |
67 | su = object_size; | |
68 | } | |
69 | uint64_t stripes_per_object = object_size / su; | |
70 | ldout(cct, 20) << " su " << su << " sc " << stripe_count << " os " | |
71 | << object_size << " stripes_per_object " << stripes_per_object | |
72 | << dendl; | |
73 | ||
74 | uint64_t cur = offset; | |
75 | uint64_t left = len; | |
76 | while (left > 0) { | |
77 | // layout into objects | |
78 | uint64_t blockno = cur / su; // which block | |
79 | // which horizontal stripe (Y) | |
80 | uint64_t stripeno = blockno / stripe_count; | |
81 | // which object in the object set (X) | |
82 | uint64_t stripepos = blockno % stripe_count; | |
83 | // which object set | |
84 | uint64_t objectsetno = stripeno / stripes_per_object; | |
85 | // object id | |
86 | uint64_t objectno = objectsetno * stripe_count + stripepos; | |
87 | ||
88 | // find oid, extent | |
89 | char buf[strlen(object_format) + 32]; | |
90 | snprintf(buf, sizeof(buf), object_format, (long long unsigned)objectno); | |
91 | object_t oid = buf; | |
92 | ||
93 | // map range into object | |
94 | uint64_t block_start = (stripeno % stripes_per_object) * su; | |
95 | uint64_t block_off = cur % su; | |
96 | uint64_t max = su - block_off; | |
97 | ||
98 | uint64_t x_offset = block_start + block_off; | |
99 | uint64_t x_len; | |
100 | if (left > max) | |
101 | x_len = max; | |
102 | else | |
103 | x_len = left; | |
104 | ||
105 | ldout(cct, 20) << " off " << cur << " blockno " << blockno << " stripeno " | |
106 | << stripeno << " stripepos " << stripepos << " objectsetno " | |
107 | << objectsetno << " objectno " << objectno | |
108 | << " block_start " << block_start << " block_off " | |
109 | << block_off << " " << x_offset << "~" << x_len | |
110 | << dendl; | |
111 | ||
112 | ObjectExtent *ex = 0; | |
113 | vector<ObjectExtent>& exv = object_extents[oid]; | |
114 | if (exv.empty() || exv.back().offset + exv.back().length != x_offset) { | |
115 | exv.resize(exv.size() + 1); | |
116 | ex = &exv.back(); | |
117 | ex->oid = oid; | |
118 | ex->objectno = objectno; | |
119 | ex->oloc = OSDMap::file_to_object_locator(*layout); | |
120 | ||
121 | ex->offset = x_offset; | |
122 | ex->length = x_len; | |
123 | ex->truncate_size = object_truncate_size(cct, layout, objectno, | |
124 | trunc_size); | |
125 | ||
126 | ldout(cct, 20) << " added new " << *ex << dendl; | |
127 | } else { | |
128 | // add to extent | |
129 | ex = &exv.back(); | |
130 | ldout(cct, 20) << " adding in to " << *ex << dendl; | |
131 | ex->length += x_len; | |
132 | } | |
133 | ex->buffer_extents.push_back(make_pair(cur - offset + buffer_offset, | |
134 | x_len)); | |
135 | ||
136 | ldout(cct, 15) << "file_to_extents " << *ex << " in " << ex->oloc | |
137 | << dendl; | |
138 | // ldout(cct, 0) << "map: ino " << ino << " oid " << ex.oid << " osd " | |
139 | // << ex.osd << " offset " << ex.offset << " len " << ex.len | |
140 | // << " ... left " << left << dendl; | |
141 | ||
142 | left -= x_len; | |
143 | cur += x_len; | |
144 | } | |
145 | } | |
146 | ||
147 | void Striper::assimilate_extents( | |
148 | map<object_t,vector<ObjectExtent> >& object_extents, | |
149 | vector<ObjectExtent>& extents) | |
150 | { | |
151 | // make final list | |
152 | for (map<object_t, vector<ObjectExtent> >::iterator it | |
153 | = object_extents.begin(); | |
154 | it != object_extents.end(); | |
155 | ++it) { | |
156 | for (vector<ObjectExtent>::iterator p = it->second.begin(); | |
157 | p != it->second.end(); | |
158 | ++p) { | |
159 | extents.push_back(*p); | |
160 | } | |
161 | } | |
162 | } | |
163 | ||
164 | void Striper::extent_to_file(CephContext *cct, file_layout_t *layout, | |
165 | uint64_t objectno, uint64_t off, uint64_t len, | |
166 | vector<pair<uint64_t, uint64_t> >& extents) | |
167 | { | |
168 | ldout(cct, 10) << "extent_to_file " << objectno << " " << off << "~" | |
169 | << len << dendl; | |
170 | ||
171 | __u32 object_size = layout->object_size; | |
172 | __u32 su = layout->stripe_unit; | |
173 | __u32 stripe_count = layout->stripe_count; | |
11fdf7f2 | 174 | ceph_assert(object_size >= su); |
7c673cae FG |
175 | uint64_t stripes_per_object = object_size / su; |
176 | ldout(cct, 20) << " stripes_per_object " << stripes_per_object << dendl; | |
177 | ||
178 | uint64_t off_in_block = off % su; | |
179 | ||
180 | extents.reserve(len / su + 1); | |
181 | ||
182 | while (len > 0) { | |
183 | uint64_t stripepos = objectno % stripe_count; | |
184 | uint64_t objectsetno = objectno / stripe_count; | |
185 | uint64_t stripeno = off / su + objectsetno * stripes_per_object; | |
186 | uint64_t blockno = stripeno * stripe_count + stripepos; | |
187 | uint64_t extent_off = blockno * su + off_in_block; | |
11fdf7f2 | 188 | uint64_t extent_len = std::min(len, su - off_in_block); |
7c673cae FG |
189 | extents.push_back(make_pair(extent_off, extent_len)); |
190 | ||
191 | ldout(cct, 20) << " object " << off << "~" << extent_len | |
192 | << " -> file " << extent_off << "~" << extent_len | |
193 | << dendl; | |
194 | ||
195 | off_in_block = 0; | |
196 | off += extent_len; | |
197 | len -= extent_len; | |
198 | } | |
199 | } | |
200 | ||
201 | uint64_t Striper::object_truncate_size(CephContext *cct, | |
202 | const file_layout_t *layout, | |
203 | uint64_t objectno, uint64_t trunc_size) | |
204 | { | |
205 | uint64_t obj_trunc_size; | |
206 | if (trunc_size == 0 || trunc_size == (uint64_t)-1) { | |
207 | obj_trunc_size = trunc_size; | |
208 | } else { | |
209 | __u32 object_size = layout->object_size; | |
210 | __u32 su = layout->stripe_unit; | |
211 | __u32 stripe_count = layout->stripe_count; | |
11fdf7f2 | 212 | ceph_assert(object_size >= su); |
7c673cae FG |
213 | uint64_t stripes_per_object = object_size / su; |
214 | ||
215 | uint64_t objectsetno = objectno / stripe_count; | |
216 | uint64_t trunc_objectsetno = trunc_size / object_size / stripe_count; | |
217 | if (objectsetno > trunc_objectsetno) | |
218 | obj_trunc_size = 0; | |
219 | else if (objectsetno < trunc_objectsetno) | |
220 | obj_trunc_size = object_size; | |
221 | else { | |
222 | uint64_t trunc_blockno = trunc_size / su; | |
223 | uint64_t trunc_stripeno = trunc_blockno / stripe_count; | |
224 | uint64_t trunc_stripepos = trunc_blockno % stripe_count; | |
225 | uint64_t trunc_objectno = trunc_objectsetno * stripe_count | |
226 | + trunc_stripepos; | |
227 | if (objectno < trunc_objectno) | |
228 | obj_trunc_size = ((trunc_stripeno % stripes_per_object) + 1) * su; | |
229 | else if (objectno > trunc_objectno) | |
230 | obj_trunc_size = (trunc_stripeno % stripes_per_object) * su; | |
231 | else | |
232 | obj_trunc_size = (trunc_stripeno % stripes_per_object) * su | |
233 | + (trunc_size % su); | |
234 | } | |
235 | } | |
236 | ldout(cct, 20) << "object_truncate_size " << objectno << " " | |
237 | << trunc_size << "->" << obj_trunc_size << dendl; | |
238 | return obj_trunc_size; | |
239 | } | |
240 | ||
241 | uint64_t Striper::get_num_objects(const file_layout_t& layout, | |
242 | uint64_t size) | |
243 | { | |
244 | __u32 stripe_unit = layout.stripe_unit; | |
245 | __u32 stripe_count = layout.stripe_count; | |
246 | uint64_t period = layout.get_period(); | |
247 | uint64_t num_periods = (size + period - 1) / period; | |
248 | uint64_t remainder_bytes = size % period; | |
249 | uint64_t remainder_objs = 0; | |
250 | if ((remainder_bytes > 0) && (remainder_bytes < (uint64_t)stripe_count | |
251 | * stripe_unit)) | |
252 | remainder_objs = stripe_count - ((remainder_bytes + stripe_unit - 1) | |
253 | / stripe_unit); | |
254 | return num_periods * stripe_count - remainder_objs; | |
255 | } | |
256 | ||
257 | // StripedReadResult | |
258 | ||
259 | void Striper::StripedReadResult::add_partial_result( | |
260 | CephContext *cct, bufferlist& bl, | |
261 | const vector<pair<uint64_t,uint64_t> >& buffer_extents) | |
262 | { | |
263 | ldout(cct, 10) << "add_partial_result(" << this << ") " << bl.length() | |
264 | << " to " << buffer_extents << dendl; | |
265 | for (vector<pair<uint64_t,uint64_t> >::const_iterator p | |
266 | = buffer_extents.begin(); | |
267 | p != buffer_extents.end(); | |
268 | ++p) { | |
269 | pair<bufferlist, uint64_t>& r = partial[p->first]; | |
11fdf7f2 | 270 | size_t actual = std::min<uint64_t>(bl.length(), p->second); |
7c673cae FG |
271 | bl.splice(0, actual, &r.first); |
272 | r.second = p->second; | |
273 | total_intended_len += r.second; | |
274 | } | |
275 | } | |
276 | ||
277 | void Striper::StripedReadResult::add_partial_sparse_result( | |
278 | CephContext *cct, bufferlist& bl, const map<uint64_t, uint64_t>& bl_map, | |
279 | uint64_t bl_off, const vector<pair<uint64_t,uint64_t> >& buffer_extents) | |
280 | { | |
281 | ldout(cct, 10) << "add_partial_sparse_result(" << this << ") " << bl.length() | |
282 | << " covering " << bl_map << " (offset " << bl_off << ")" | |
283 | << " to " << buffer_extents << dendl; | |
284 | map<uint64_t, uint64_t>::const_iterator s = bl_map.begin(); | |
285 | for (vector<pair<uint64_t,uint64_t> >::const_iterator p | |
286 | = buffer_extents.begin(); | |
287 | p != buffer_extents.end(); | |
288 | ++p) { | |
289 | uint64_t tofs = p->first; | |
11fdf7f2 | 290 | size_t tlen = p->second; |
7c673cae FG |
291 | ldout(cct, 30) << " be " << tofs << "~" << tlen << dendl; |
292 | while (tlen > 0) { | |
293 | ldout(cct, 20) << " t " << tofs << "~" << tlen | |
294 | << " bl has " << bl.length() | |
295 | << " off " << bl_off | |
296 | << dendl; | |
297 | if (s == bl_map.end()) { | |
298 | ldout(cct, 20) << " s at end" << dendl; | |
299 | pair<bufferlist, uint64_t>& r = partial[tofs]; | |
300 | r.second = tlen; | |
301 | total_intended_len += r.second; | |
302 | break; | |
303 | } | |
304 | ||
305 | ldout(cct, 30) << " s " << s->first << "~" << s->second << dendl; | |
306 | ||
307 | // skip zero-length extent | |
308 | if (s->second == 0) { | |
309 | ldout(cct, 30) << " s len 0, skipping" << dendl; | |
310 | ++s; | |
311 | continue; | |
312 | } | |
313 | ||
314 | if (s->first > bl_off) { | |
315 | // gap in sparse read result | |
316 | pair<bufferlist, uint64_t>& r = partial[tofs]; | |
11fdf7f2 | 317 | size_t gap = std::min<size_t>(s->first - bl_off, tlen); |
7c673cae FG |
318 | ldout(cct, 20) << " s gap " << gap << ", skipping" << dendl; |
319 | r.second = gap; | |
320 | total_intended_len += r.second; | |
321 | bl_off += gap; | |
322 | tofs += gap; | |
323 | tlen -= gap; | |
324 | if (tlen == 0) { | |
325 | continue; | |
326 | } | |
327 | } | |
328 | ||
11fdf7f2 | 329 | ceph_assert(s->first <= bl_off); |
7c673cae | 330 | size_t left = (s->first + s->second) - bl_off; |
11fdf7f2 | 331 | size_t actual = std::min(left, tlen); |
7c673cae FG |
332 | |
333 | if (actual > 0) { | |
334 | ldout(cct, 20) << " s has " << actual << ", copying" << dendl; | |
335 | pair<bufferlist, uint64_t>& r = partial[tofs]; | |
336 | bl.splice(0, actual, &r.first); | |
337 | r.second = actual; | |
338 | total_intended_len += r.second; | |
339 | bl_off += actual; | |
340 | tofs += actual; | |
341 | tlen -= actual; | |
342 | } | |
343 | if (actual == left) { | |
344 | ldout(cct, 30) << " s advancing" << dendl; | |
345 | ++s; | |
346 | } | |
347 | } | |
348 | } | |
349 | } | |
350 | ||
351 | void Striper::StripedReadResult::assemble_result(CephContext *cct, | |
352 | bufferlist& bl, | |
353 | bool zero_tail) | |
354 | { | |
355 | ldout(cct, 10) << "assemble_result(" << this << ") zero_tail=" << zero_tail | |
356 | << dendl; | |
11fdf7f2 TL |
357 | size_t zeros = 0; // zeros preceding current position |
358 | for (auto& p : partial) { | |
359 | size_t got = p.second.first.length(); | |
360 | size_t expect = p.second.second; | |
361 | if (got) { | |
362 | if (zeros) { | |
363 | bl.append_zero(zeros); | |
364 | zeros = 0; | |
7c673cae | 365 | } |
11fdf7f2 | 366 | bl.claim_append(p.second.first); |
7c673cae | 367 | } |
11fdf7f2 TL |
368 | zeros += expect - got; |
369 | } | |
370 | if (zero_tail && zeros) { | |
371 | bl.append_zero(zeros); | |
7c673cae FG |
372 | } |
373 | partial.clear(); | |
374 | } | |
375 | ||
376 | void Striper::StripedReadResult::assemble_result(CephContext *cct, char *buffer, size_t length) | |
377 | { | |
378 | ||
11fdf7f2 | 379 | ceph_assert(buffer && length == total_intended_len); |
7c673cae FG |
380 | |
381 | map<uint64_t,pair<bufferlist,uint64_t> >::reverse_iterator p = partial.rbegin(); | |
382 | if (p == partial.rend()) | |
383 | return; | |
384 | ||
385 | uint64_t curr = length; | |
386 | uint64_t end = p->first + p->second.second; | |
387 | while (p != partial.rend()) { | |
388 | // sanity check | |
389 | ldout(cct, 20) << "assemble_result(" << this << ") " << p->first << "~" << p->second.second | |
390 | << " " << p->second.first.length() << " bytes" | |
391 | << dendl; | |
11fdf7f2 | 392 | ceph_assert(p->first == end - p->second.second); |
7c673cae FG |
393 | end = p->first; |
394 | ||
395 | size_t len = p->second.first.length(); | |
11fdf7f2 | 396 | ceph_assert(curr >= p->second.second); |
7c673cae FG |
397 | curr -= p->second.second; |
398 | if (len < p->second.second) { | |
399 | if (len) | |
400 | p->second.first.copy(0, len, buffer + curr); | |
401 | memset(buffer + curr + len, 0, p->second.second - len); | |
402 | } else { | |
403 | p->second.first.copy(0, len, buffer + curr); | |
404 | } | |
405 | ++p; | |
406 | } | |
407 | partial.clear(); | |
11fdf7f2 | 408 | ceph_assert(curr == 0); |
7c673cae FG |
409 | } |
410 |