]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "libradosstriper/RadosStriperImpl.h" | |
16 | ||
17 | #include <errno.h> | |
18 | ||
19 | #include <sstream> | |
20 | #include <iomanip> | |
21 | #include <algorithm> | |
22 | ||
23 | #include "include/types.h" | |
24 | #include "include/uuid.h" | |
25 | #include "include/ceph_fs.h" | |
26 | #include "common/dout.h" | |
27 | #include "common/strtol.h" | |
28 | #include "osdc/Striper.h" | |
29 | #include "libradosstriper/MultiAioCompletionImpl.h" | |
30 | #include "librados/AioCompletionImpl.h" | |
31 | #include <cls/lock/cls_lock_client.h> | |
32 | ||
33 | /* | |
34 | * This file contains the actual implementation of the rados striped objects interface. | |
35 | * | |
36 | * Striped objects are stored in rados in a set of regular rados objects, after their | |
37 | * content has been striped using the osdc/Striper interface. | |
38 | * | |
39 | * The external attributes of the striped object are mapped to the attributes of the | |
40 | * first underlying object. This first object has a set of extra external attributes | |
41 | * storing the layout of the striped object for future read back. These attributes are : | |
42 | * - striper.layout.object_size : the size of rados objects used. | |
43 | * Must be a multiple of striper.layout.stripe_unit | |
44 | * - striper.layout.stripe_unit : the size of a stripe unit | |
45 | * - striper.layout.stripe_count : the number of stripes used | |
46 | * - striper.size : total striped object size | |
47 | * | |
48 | * In general operations on striped objects are not atomic. | |
49 | * However, a certain number of safety guards have been put to make the interface closer | |
50 | * to atomicity : | |
51 | * - each data operation takes a shared lock on the first rados object for the | |
52 | * whole time of the operation | |
53 | * - the remove and trunc operations take an exclusive lock on the first rados object | |
54 | * for the whole time of the operation | |
55 | * This makes sure that no removal/truncation of a striped object occurs while | |
56 | * data operations are happening and vice versa. It thus makes sure that the layout | |
57 | * of a striped object does not change during data operation, which is essential for | |
58 | * data consistency. | |
59 | * | |
60 | * Still the writing to a striped object is not atomic. This means in particular that | |
61 | * the size of an object may not be in sync with its content at all times. | |
62 | * As the size is always guaranteed to be updated first and in an atomic way, and as | |
63 | * sparse striped objects are supported (see below), what will typically happen is | |
64 | * that a reader that comes too soon after a write will read 0s instead of the actual | |
65 | * data. | |
66 | * | |
67 | * Note that remove handles the pieces of the striped object in reverse order, | |
68 | * so that the head object is removed last, making the completion of the deletion atomic. | |
69 | * | |
70 | * Striped objects can be sparse, typically in case data was written at the end of the | |
71 | * striped object only. In such a case, some rados objects constituting the striped object | |
72 | * may be missing. Others can be partial (only the beginning will have data). | |
73 | * When dealing with such sparse striped files, missing objects are detected and | |
74 | * considered as full of 0s. They are however not created until real data is written | |
75 | * to them. | |
76 | * | |
77 | * There are a number of missing features/improvements that could be implemented. | |
78 | * Here are some ideas : | |
79 | * - implementation of missing entry points (compared to rados) | |
80 | * In particular : clone_range, sparse_read, exec, aio_flush_async, tmaps, omaps, ... | |
81 | * | |
82 | */ | |
83 | ||
84 | #define dout_subsys ceph_subsys_rados | |
85 | #undef dout_prefix | |
86 | #define dout_prefix *_dout << "libradosstriper: " | |
87 | ||
88 | /// size of xattr buffer | |
89 | #define XATTR_BUFFER_SIZE 32 | |
90 | ||
91 | /// names of the different xattr entries | |
92 | #define XATTR_LAYOUT_STRIPE_UNIT "striper.layout.stripe_unit" | |
93 | #define XATTR_LAYOUT_STRIPE_COUNT "striper.layout.stripe_count" | |
94 | #define XATTR_LAYOUT_OBJECT_SIZE "striper.layout.object_size" | |
95 | #define XATTR_SIZE "striper.size" | |
96 | #define LOCK_PREFIX "lock." | |
97 | ||
98 | /// name of the lock used on objects to ensure layout stability during IO | |
99 | #define RADOS_LOCK_NAME "striper.lock" | |
100 | ||
101 | /// format of the extension of rados objects created for a given striped object | |
102 | #define RADOS_OBJECT_EXTENSION_FORMAT ".%016llx" | |
103 | ||
104 | /// default object layout | |
105 | struct ceph_file_layout default_file_layout = { | |
106 | init_le32(1<<22), // fl_stripe_unit | |
107 | init_le32(1), // fl_stripe_count | |
108 | init_le32(1<<22), // fl_object_size | |
109 | init_le32(0), // fl_cas_hash | |
110 | init_le32(0), // fl_object_stripe_unit | |
111 | init_le32(-1), // fl_unused | |
112 | init_le32(-1), // fl_pg_pool | |
113 | }; | |
114 | ||
115 | ||
116 | ///////////////////////// CompletionData ///////////////////////////// | |
117 | ||
118 | libradosstriper::RadosStriperImpl::CompletionData::CompletionData | |
119 | (libradosstriper::RadosStriperImpl* striper, | |
120 | const std::string& soid, | |
121 | const std::string& lockCookie, | |
122 | librados::AioCompletionImpl *userCompletion, | |
123 | int n) : | |
124 | RefCountedObject(striper->cct(), n), | |
125 | m_striper(striper), m_soid(soid), m_lockCookie(lockCookie), m_ack(0) { | |
126 | m_striper->get(); | |
127 | if (userCompletion) { | |
128 | m_ack = new librados::IoCtxImpl::C_aio_Complete(userCompletion); | |
129 | userCompletion->io = striper->m_ioCtxImpl; | |
130 | } | |
131 | } | |
132 | ||
133 | libradosstriper::RadosStriperImpl::CompletionData::~CompletionData() { | |
134 | if (m_ack) delete m_ack; | |
135 | m_striper->put(); | |
136 | } | |
137 | ||
138 | void libradosstriper::RadosStriperImpl::CompletionData::complete(int r) { | |
139 | if (m_ack) m_ack->finish(r); | |
140 | } | |
141 | ||
142 | libradosstriper::RadosStriperImpl::ReadCompletionData::ReadCompletionData | |
143 | (libradosstriper::RadosStriperImpl* striper, | |
144 | const std::string& soid, | |
145 | const std::string& lockCookie, | |
146 | librados::AioCompletionImpl *userCompletion, | |
147 | bufferlist* bl, | |
148 | std::vector<ObjectExtent>* extents, | |
149 | std::vector<bufferlist>* resultbl, | |
150 | int n) : | |
151 | CompletionData(striper, soid, lockCookie, userCompletion, n), | |
152 | m_bl(bl), m_extents(extents), m_resultbl(resultbl), m_readRc(0), | |
153 | m_unlockCompletion(0) {} | |
154 | ||
155 | libradosstriper::RadosStriperImpl::ReadCompletionData::~ReadCompletionData() { | |
156 | m_unlockCompletion->release(); | |
157 | delete m_extents; | |
158 | delete m_resultbl; | |
159 | } | |
160 | ||
161 | void libradosstriper::RadosStriperImpl::ReadCompletionData::complete_read(int r) { | |
162 | // gather data into final buffer | |
163 | Striper::StripedReadResult readResult; | |
164 | vector<bufferlist>::iterator bit = m_resultbl->begin(); | |
165 | for (vector<ObjectExtent>::iterator eit = m_extents->begin(); | |
166 | eit != m_extents->end(); | |
167 | ++eit, ++bit) { | |
168 | readResult.add_partial_result(m_striper->cct(), *bit, eit->buffer_extents); | |
169 | } | |
170 | m_bl->clear(); | |
171 | readResult.assemble_result(m_striper->cct(), *m_bl, true); | |
172 | // Remember return code | |
173 | m_readRc = r; | |
174 | } | |
175 | ||
176 | void libradosstriper::RadosStriperImpl::ReadCompletionData::complete_unlock(int r) { | |
177 | // call parent's completion method | |
178 | // Note that we ignore the return code of the unlock as we cannot do much about it | |
179 | CompletionData::complete(m_readRc?m_readRc:m_bl->length()); | |
180 | } | |
181 | ||
182 | libradosstriper::RadosStriperImpl::WriteCompletionData::WriteCompletionData | |
183 | (libradosstriper::RadosStriperImpl* striper, | |
184 | const std::string& soid, | |
185 | const std::string& lockCookie, | |
186 | librados::AioCompletionImpl *userCompletion, | |
187 | int n) : | |
188 | CompletionData(striper, soid, lockCookie, userCompletion, n), m_safe(0), | |
189 | m_unlockCompletion(0) { | |
190 | if (userCompletion) { | |
191 | m_safe = new librados::IoCtxImpl::C_aio_Complete(userCompletion); | |
192 | } | |
193 | } | |
194 | ||
195 | libradosstriper::RadosStriperImpl::WriteCompletionData::~WriteCompletionData() { | |
196 | m_unlockCompletion->release(); | |
197 | if (m_safe) delete m_safe; | |
198 | } | |
199 | ||
200 | void libradosstriper::RadosStriperImpl::WriteCompletionData::complete_unlock(int r) { | |
201 | // call parent's completion method | |
202 | // Note that we ignore the return code of the unlock as we cannot do much about it | |
203 | CompletionData::complete(m_writeRc); | |
204 | } | |
205 | ||
206 | void libradosstriper::RadosStriperImpl::WriteCompletionData::complete_write(int r) { | |
207 | // Remember return code | |
208 | m_writeRc = r; | |
209 | } | |
210 | ||
211 | void libradosstriper::RadosStriperImpl::WriteCompletionData::safe(int r) { | |
212 | if (m_safe) m_safe->finish(r); | |
213 | } | |
214 | ||
215 | libradosstriper::RadosStriperImpl::RemoveCompletionData::RemoveCompletionData | |
216 | (libradosstriper::RadosStriperImpl* striper, | |
217 | const std::string& soid, | |
218 | const std::string& lockCookie, | |
219 | librados::AioCompletionImpl *userCompletion, | |
220 | int flags) : | |
221 | CompletionData(striper, soid, lockCookie, userCompletion), flags(flags) {} | |
222 | ||
223 | libradosstriper::RadosStriperImpl::TruncateCompletionData::TruncateCompletionData | |
224 | (libradosstriper::RadosStriperImpl* striper, | |
225 | const std::string& soid, | |
226 | uint64_t size) : | |
227 | RefCountedObject(striper->cct()), | |
228 | m_striper(striper), m_soid(soid), m_size(size) { | |
229 | m_striper->get(); | |
230 | } | |
231 | ||
232 | libradosstriper::RadosStriperImpl::TruncateCompletionData::~TruncateCompletionData() { | |
233 | m_striper->put(); | |
234 | } | |
235 | ||
236 | ///////////////////////// constructor ///////////////////////////// | |
237 | ||
238 | libradosstriper::RadosStriperImpl::RadosStriperImpl(librados::IoCtx& ioctx, librados::IoCtxImpl *ioctx_impl) : | |
239 | m_refCnt(0),lock("RadosStriper Refcont", false, false), m_radosCluster(ioctx), m_ioCtx(ioctx), m_ioCtxImpl(ioctx_impl), | |
240 | m_layout(default_file_layout) {} | |
241 | ||
242 | ///////////////////////// layout ///////////////////////////// | |
243 | ||
244 | int libradosstriper::RadosStriperImpl::setObjectLayoutStripeUnit | |
245 | (unsigned int stripe_unit) | |
246 | { | |
247 | /* stripe unit must be non-zero, 64k increment */ | |
248 | if (!stripe_unit || (stripe_unit & (CEPH_MIN_STRIPE_UNIT-1))) | |
249 | return -EINVAL; | |
250 | m_layout.fl_stripe_unit = stripe_unit; | |
251 | return 0; | |
252 | } | |
253 | ||
254 | int libradosstriper::RadosStriperImpl::setObjectLayoutStripeCount | |
255 | (unsigned int stripe_count) | |
256 | { | |
257 | /* stripe count must be non-zero */ | |
258 | if (!stripe_count) | |
259 | return -EINVAL; | |
260 | m_layout.fl_stripe_count = stripe_count; | |
261 | return 0; | |
262 | } | |
263 | ||
264 | int libradosstriper::RadosStriperImpl::setObjectLayoutObjectSize | |
265 | (unsigned int object_size) | |
266 | { | |
267 | /* object size must be non-zero, 64k increment */ | |
268 | if (!object_size || (object_size & (CEPH_MIN_STRIPE_UNIT-1))) | |
269 | return -EINVAL; | |
270 | /* object size must be a multiple of stripe unit */ | |
271 | if (object_size < m_layout.fl_stripe_unit || | |
272 | object_size % m_layout.fl_stripe_unit) | |
273 | return -EINVAL; | |
274 | m_layout.fl_object_size = object_size; | |
275 | return 0; | |
276 | } | |
277 | ||
278 | ///////////////////////// xattrs ///////////////////////////// | |
279 | ||
280 | int libradosstriper::RadosStriperImpl::getxattr(const object_t& soid, | |
281 | const char *name, | |
282 | bufferlist& bl) | |
283 | { | |
284 | std::string firstObjOid = getObjectId(soid, 0); | |
285 | return m_ioCtx.getxattr(firstObjOid, name, bl); | |
286 | } | |
287 | ||
288 | int libradosstriper::RadosStriperImpl::setxattr(const object_t& soid, | |
289 | const char *name, | |
290 | bufferlist& bl) | |
291 | { | |
292 | std::string firstObjOid = getObjectId(soid, 0); | |
293 | return m_ioCtx.setxattr(firstObjOid, name, bl); | |
294 | } | |
295 | ||
296 | int libradosstriper::RadosStriperImpl::getxattrs(const object_t& soid, | |
297 | map<string, bufferlist>& attrset) | |
298 | { | |
299 | std::string firstObjOid = getObjectId(soid, 0); | |
300 | int rc = m_ioCtx.getxattrs(firstObjOid, attrset); | |
301 | if (rc) return rc; | |
302 | // cleanup internal attributes dedicated to striping and locking | |
303 | attrset.erase(XATTR_LAYOUT_STRIPE_UNIT); | |
304 | attrset.erase(XATTR_LAYOUT_STRIPE_COUNT); | |
305 | attrset.erase(XATTR_LAYOUT_OBJECT_SIZE); | |
306 | attrset.erase(XATTR_SIZE); | |
307 | attrset.erase(std::string(LOCK_PREFIX) + RADOS_LOCK_NAME); | |
308 | return rc; | |
309 | } | |
310 | ||
311 | int libradosstriper::RadosStriperImpl::rmxattr(const object_t& soid, | |
312 | const char *name) | |
313 | { | |
314 | std::string firstObjOid = getObjectId(soid, 0); | |
315 | return m_ioCtx.rmxattr(firstObjOid, name); | |
316 | } | |
317 | ||
318 | ///////////////////////// io ///////////////////////////// | |
319 | ||
320 | int libradosstriper::RadosStriperImpl::write(const std::string& soid, | |
321 | const bufferlist& bl, | |
322 | size_t len, | |
323 | uint64_t off) | |
324 | { | |
325 | // open the object. This will create it if needed, retrieve its layout | |
326 | // and size and take a shared lock on it | |
327 | ceph_file_layout layout; | |
328 | std::string lockCookie; | |
329 | int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true); | |
330 | if (rc) return rc; | |
331 | return write_in_open_object(soid, layout, lockCookie, bl, len, off); | |
332 | } | |
333 | ||
334 | int libradosstriper::RadosStriperImpl::append(const std::string& soid, | |
335 | const bufferlist& bl, | |
336 | size_t len) | |
337 | { | |
338 | // open the object. This will create it if needed, retrieve its layout | |
339 | // and size and take a shared lock on it | |
340 | ceph_file_layout layout; | |
341 | uint64_t size = len; | |
342 | std::string lockCookie; | |
343 | int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false); | |
344 | if (rc) return rc; | |
345 | return write_in_open_object(soid, layout, lockCookie, bl, len, size); | |
346 | } | |
347 | ||
348 | int libradosstriper::RadosStriperImpl::write_full(const std::string& soid, | |
349 | const bufferlist& bl) | |
350 | { | |
351 | int rc = trunc(soid, 0); | |
352 | if (rc && rc != -ENOENT) return rc; // ENOENT is obviously ok | |
353 | return write(soid, bl, bl.length(), 0); | |
354 | } | |
355 | ||
356 | int libradosstriper::RadosStriperImpl::read(const std::string& soid, | |
357 | bufferlist* bl, | |
358 | size_t len, | |
359 | uint64_t off) | |
360 | { | |
361 | // create a completion object | |
362 | librados::AioCompletionImpl c; | |
363 | // call asynchronous method | |
364 | int rc = aio_read(soid, &c, bl, len, off); | |
365 | // and wait for completion | |
366 | if (!rc) { | |
367 | // wait for completion | |
368 | c.wait_for_complete_and_cb(); | |
369 | // return result | |
370 | rc = c.get_return_value(); | |
371 | } | |
372 | return rc; | |
373 | } | |
374 | ||
375 | ///////////////////////// asynchronous io ///////////////////////////// | |
376 | ||
377 | int libradosstriper::RadosStriperImpl::aio_write(const std::string& soid, | |
378 | librados::AioCompletionImpl *c, | |
379 | const bufferlist& bl, | |
380 | size_t len, | |
381 | uint64_t off) | |
382 | { | |
383 | ceph_file_layout layout; | |
384 | std::string lockCookie; | |
385 | int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true); | |
386 | if (rc) return rc; | |
387 | return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, off); | |
388 | } | |
389 | ||
390 | int libradosstriper::RadosStriperImpl::aio_append(const std::string& soid, | |
391 | librados::AioCompletionImpl *c, | |
392 | const bufferlist& bl, | |
393 | size_t len) | |
394 | { | |
395 | ceph_file_layout layout; | |
396 | uint64_t size = len; | |
397 | std::string lockCookie; | |
398 | int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false); | |
399 | if (rc) return rc; | |
400 | // create a completion object | |
401 | return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, size); | |
402 | } | |
403 | ||
404 | int libradosstriper::RadosStriperImpl::aio_write_full(const std::string& soid, | |
405 | librados::AioCompletionImpl *c, | |
406 | const bufferlist& bl) | |
407 | { | |
408 | int rc = trunc(soid, 0); | |
409 | if (rc) return rc; | |
410 | return aio_write(soid, c, bl, bl.length(), 0); | |
411 | } | |
412 | ||
413 | static void rados_read_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg) | |
414 | { | |
415 | libradosstriper::RadosStriperImpl::ReadCompletionData *cdata = | |
416 | reinterpret_cast<libradosstriper::RadosStriperImpl::ReadCompletionData*>(arg); | |
417 | libradosstriper::MultiAioCompletionImpl *comp = | |
418 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
419 | cdata->complete_unlock(comp->rval); | |
420 | cdata->put(); | |
421 | } | |
422 | ||
423 | static void striper_read_aio_req_complete(rados_striper_multi_completion_t c, void *arg) | |
424 | { | |
425 | libradosstriper::RadosStriperImpl::ReadCompletionData *cdata = | |
426 | reinterpret_cast<libradosstriper::RadosStriperImpl::ReadCompletionData*>(arg); | |
427 | // launch the async unlocking of the object | |
428 | cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion); | |
429 | // complete the read part in parallel | |
430 | libradosstriper::MultiAioCompletionImpl *comp = | |
431 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
432 | cdata->complete_read(comp->rval); | |
433 | } | |
434 | ||
435 | static void rados_req_read_safe(rados_completion_t c, void *arg) | |
436 | { | |
437 | libradosstriper::RadosStriperImpl::RadosReadCompletionData *data = | |
438 | reinterpret_cast<libradosstriper::RadosStriperImpl::RadosReadCompletionData*>(arg); | |
439 | int rc = rados_aio_get_return_value(c); | |
440 | // ENOENT means that we are dealing with a sparse file. This is fine, | |
441 | // data (0s) will be created on the fly by the rados_req_read_complete method | |
442 | if (rc == -ENOENT) rc = 0; | |
443 | libradosstriper::MultiAioCompletionImpl *multiAioComp = data->m_multiAioCompl; | |
444 | multiAioComp->safe_request(rc); | |
445 | data->put(); | |
446 | } | |
447 | ||
448 | static void rados_req_read_complete(rados_completion_t c, void *arg) | |
449 | { | |
450 | libradosstriper::RadosStriperImpl::RadosReadCompletionData *data = | |
451 | reinterpret_cast<libradosstriper::RadosStriperImpl::RadosReadCompletionData*>(arg); | |
452 | int rc = rados_aio_get_return_value(c); | |
453 | // We need to handle the case of sparse files here | |
454 | if (rc == -ENOENT) { | |
455 | // the object did not exist at all. This can happen for sparse files. | |
456 | // we consider we've read 0 bytes and it will fall into next case | |
457 | rc = 0; | |
458 | } | |
459 | if (rc >= 0 && (((uint64_t)rc) < data->m_expectedBytes)) { | |
460 | // only partial data were present in the object (or the object did not | |
461 | // even exist if we've gone through previous case). | |
462 | // This is typical of sparse file and we need to complete with 0s. | |
463 | unsigned int lenOfZeros = data->m_expectedBytes-rc; | |
464 | unsigned int existingDataToZero = min(data->m_bl->length()-rc, lenOfZeros); | |
465 | if (existingDataToZero > 0) { | |
466 | data->m_bl->zero(rc, existingDataToZero); | |
467 | } | |
468 | if (lenOfZeros > existingDataToZero) { | |
469 | ceph::bufferptr zeros(ceph::buffer::create(lenOfZeros-existingDataToZero)); | |
470 | zeros.zero(); | |
471 | data->m_bl->push_back(zeros); | |
472 | } | |
473 | rc = data->m_expectedBytes; | |
474 | } | |
475 | libradosstriper::MultiAioCompletionImpl * multiAioComp = data->m_multiAioCompl; | |
476 | multiAioComp->complete_request(rc); | |
477 | data->put(); | |
478 | } | |
479 | ||
480 | int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid, | |
481 | librados::AioCompletionImpl *c, | |
482 | bufferlist* bl, | |
483 | size_t len, | |
484 | uint64_t off) | |
485 | { | |
486 | // open the object. This will retrieve its layout and size | |
487 | // and take a shared lock on it | |
488 | ceph_file_layout layout; | |
489 | uint64_t size; | |
490 | std::string lockCookie; | |
491 | int rc = openStripedObjectForRead(soid, &layout, &size, &lockCookie); | |
492 | if (rc) return rc; | |
493 | // find out the actual number of bytes we can read | |
494 | uint64_t read_len; | |
495 | if (off >= size) { | |
496 | // nothing to read ! We are done. | |
497 | read_len = 0; | |
498 | } else { | |
499 | read_len = min(len, (size_t)(size-off)); | |
500 | } | |
501 | // get list of extents to be read from | |
502 | vector<ObjectExtent> *extents = new vector<ObjectExtent>(); | |
503 | if (read_len > 0) { | |
504 | std::string format = soid + RADOS_OBJECT_EXTENSION_FORMAT; | |
505 | file_layout_t l; | |
506 | l.from_legacy(layout); | |
507 | Striper::file_to_extents(cct(), format.c_str(), &l, off, read_len, | |
508 | 0, *extents); | |
509 | } | |
510 | ||
511 | // create a completion object and transfer ownership of extents and resultbl | |
512 | vector<bufferlist> *resultbl = new vector<bufferlist>(extents->size()); | |
513 | ReadCompletionData *cdata = new ReadCompletionData(this, soid, lockCookie, c, | |
514 | bl, extents, resultbl, 1); | |
515 | c->is_read = true; | |
516 | c->io = m_ioCtxImpl; | |
517 | // create a completion for the unlocking of the striped object at the end of the read | |
518 | librados::AioCompletion *unlock_completion = | |
519 | librados::Rados::aio_create_completion(cdata, rados_read_aio_unlock_complete, 0); | |
520 | cdata->m_unlockCompletion = unlock_completion; | |
521 | // create the multiCompletion object handling the reads | |
522 | libradosstriper::MultiAioCompletionImpl *nc = new libradosstriper::MultiAioCompletionImpl; | |
523 | nc->set_complete_callback(cdata, striper_read_aio_req_complete); | |
524 | // go through the extents | |
525 | int r = 0, i = 0; | |
526 | for (vector<ObjectExtent>::iterator p = extents->begin(); p != extents->end(); ++p) { | |
527 | // create a buffer list describing where to place data read from current extend | |
528 | bufferlist *oid_bl = &((*resultbl)[i++]); | |
529 | for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin(); | |
530 | q != p->buffer_extents.end(); | |
531 | ++q) { | |
532 | bufferlist buffer_bl; | |
533 | buffer_bl.substr_of(*bl, q->first, q->second); | |
534 | oid_bl->append(buffer_bl); | |
535 | } | |
536 | // read all extends of a given object in one go | |
537 | nc->add_request(); | |
538 | // we need 2 references on data as both rados_req_read_safe and rados_req_read_complete | |
539 | // will release one | |
540 | RadosReadCompletionData *data = new RadosReadCompletionData(nc, p->length, oid_bl, cct(), 2); | |
541 | librados::AioCompletion *rados_completion = | |
542 | librados::Rados::aio_create_completion(data, rados_req_read_complete, rados_req_read_safe); | |
543 | r = m_ioCtx.aio_read(p->oid.name, rados_completion, oid_bl, p->length, p->offset); | |
544 | rados_completion->release(); | |
545 | if (r < 0) | |
546 | break; | |
547 | } | |
548 | nc->finish_adding_requests(); | |
549 | nc->put(); | |
550 | return r; | |
551 | } | |
552 | ||
553 | int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid, | |
554 | librados::AioCompletionImpl *c, | |
555 | char* buf, | |
556 | size_t len, | |
557 | uint64_t off) | |
558 | { | |
559 | // create a buffer list and store it inside the completion object | |
560 | c->bl.clear(); | |
561 | c->bl.push_back(buffer::create_static(len, buf)); | |
562 | // call the bufferlist version of this method | |
563 | return aio_read(soid, c, &c->bl, len, off); | |
564 | } | |
565 | ||
566 | int libradosstriper::RadosStriperImpl::aio_flush() | |
567 | { | |
568 | int ret; | |
569 | // pass to the rados level | |
570 | ret = m_ioCtx.aio_flush(); | |
571 | if (ret < 0) | |
572 | return ret; | |
573 | //wait all CompletionData are released | |
574 | lock.Lock(); | |
575 | while (m_refCnt > 1) | |
576 | cond.Wait(lock); | |
577 | lock.Unlock(); | |
578 | return ret; | |
579 | } | |
580 | ||
581 | ///////////////////////// stat and deletion ///////////////////////////// | |
582 | ||
583 | int libradosstriper::RadosStriperImpl::stat(const std::string& soid, uint64_t *psize, time_t *pmtime) | |
584 | { | |
585 | // create a completion object | |
586 | librados::AioCompletionImpl c; | |
587 | // call asynchronous version of stat | |
588 | int rc = aio_stat(soid, &c, psize, pmtime); | |
589 | if (rc == 0) { | |
590 | // wait for completion of the remove | |
591 | c.wait_for_complete(); | |
592 | // get result | |
593 | rc = c.get_return_value(); | |
594 | } | |
595 | return rc; | |
596 | } | |
597 | ||
598 | static void striper_stat_aio_stat_complete(rados_completion_t c, void *arg) { | |
599 | libradosstriper::RadosStriperImpl::BasicStatCompletionData *data = | |
600 | reinterpret_cast<libradosstriper::RadosStriperImpl::BasicStatCompletionData*>(arg); | |
601 | int rc = rados_aio_get_return_value(c); | |
602 | if (rc == -ENOENT) { | |
603 | // remember this has failed | |
604 | data->m_statRC = rc; | |
605 | } | |
606 | data->m_multiCompletion->complete_request(rc); | |
607 | data->put(); | |
608 | } | |
609 | ||
610 | static void striper_stat_aio_getxattr_complete(rados_completion_t c, void *arg) { | |
611 | libradosstriper::RadosStriperImpl::BasicStatCompletionData *data = | |
612 | reinterpret_cast<libradosstriper::RadosStriperImpl::BasicStatCompletionData*>(arg); | |
613 | int rc = rados_aio_get_return_value(c); | |
614 | // We need to handle the case of sparse files here | |
615 | if (rc < 0) { | |
616 | // remember this has failed | |
617 | data->m_getxattrRC = rc; | |
618 | } else { | |
619 | // this intermediate string allows to add a null terminator before calling strtol | |
620 | std::string err; | |
621 | std::string strsize(data->m_bl.c_str(), data->m_bl.length()); | |
622 | *data->m_psize = strict_strtoll(strsize.c_str(), 10, &err); | |
623 | if (!err.empty()) { | |
624 | lderr(data->m_striper->cct()) << XATTR_SIZE << " : " << err << dendl; | |
625 | data->m_getxattrRC = -EINVAL; | |
626 | } | |
627 | rc = 0; | |
628 | } | |
629 | data->m_multiCompletion->complete_request(rc); | |
630 | data->put(); | |
631 | } | |
632 | ||
633 | static void striper_stat_aio_req_complete(rados_striper_multi_completion_t c, | |
634 | void *arg) { | |
635 | libradosstriper::RadosStriperImpl::BasicStatCompletionData *data = | |
636 | reinterpret_cast<libradosstriper::RadosStriperImpl::BasicStatCompletionData*>(arg); | |
637 | if (data->m_statRC) { | |
638 | data->complete(data->m_statRC); | |
639 | } else { | |
640 | if (data->m_getxattrRC < 0) { | |
641 | data->complete(data->m_getxattrRC); | |
642 | } else { | |
643 | data->complete(0); | |
644 | } | |
645 | } | |
646 | data->put(); | |
647 | } | |
648 | ||
649 | template<class TimeType> | |
650 | int libradosstriper::RadosStriperImpl::aio_generic_stat | |
651 | (const std::string& soid, | |
652 | librados::AioCompletionImpl *c, | |
653 | uint64_t *psize, | |
654 | TimeType *pmtime, | |
655 | typename libradosstriper::RadosStriperImpl::StatFunction<TimeType>::Type statFunction) | |
656 | { | |
657 | // use a MultiAioCompletion object for dealing with the fact | |
658 | // that we'll do 2 asynchronous calls in parallel | |
659 | libradosstriper::MultiAioCompletionImpl *multi_completion = | |
660 | new libradosstriper::MultiAioCompletionImpl; | |
661 | // Data object used for passing context to asynchronous calls | |
662 | std::string firstObjOid = getObjectId(soid, 0); | |
663 | StatCompletionData<TimeType> *cdata = | |
664 | new StatCompletionData<TimeType>(this, firstObjOid, c, | |
665 | multi_completion, psize, pmtime, 4); | |
666 | multi_completion->set_complete_callback(cdata, striper_stat_aio_req_complete); | |
667 | // use a regular AioCompletion for the stat async call | |
668 | librados::AioCompletion *stat_completion = | |
669 | librados::Rados::aio_create_completion(cdata, striper_stat_aio_stat_complete, 0); | |
670 | multi_completion->add_safe_request(); | |
671 | object_t obj(firstObjOid); | |
672 | int rc = (m_ioCtxImpl->*statFunction)(obj, stat_completion->pc, | |
673 | &cdata->m_objectSize, cdata->m_pmtime); | |
674 | stat_completion->release(); | |
675 | if (rc < 0) { | |
676 | // nothing is really started so cancel everything | |
677 | delete multi_completion; | |
678 | delete cdata; | |
679 | return rc; | |
680 | } | |
681 | // use a regular AioCompletion for the getxattr async call | |
682 | librados::AioCompletion *getxattr_completion = | |
683 | librados::Rados::aio_create_completion(cdata, striper_stat_aio_getxattr_complete, 0); | |
684 | multi_completion->add_safe_request(); | |
685 | // in parallel, get the pmsize from the first object asynchronously | |
686 | rc = m_ioCtxImpl->aio_getxattr(obj, getxattr_completion->pc, | |
687 | XATTR_SIZE, cdata->m_bl); | |
688 | getxattr_completion->release(); | |
689 | multi_completion->finish_adding_requests(); | |
690 | if (rc < 0) { | |
691 | // the async stat is ongoing, so we need to go on | |
692 | // we mark the getxattr as failed in the data object | |
693 | cdata->m_getxattrRC = rc; | |
694 | multi_completion->complete_request(rc); | |
695 | multi_completion->put(); | |
696 | return rc; | |
697 | } | |
698 | cdata->put(); | |
699 | multi_completion->put(); | |
700 | return 0; | |
701 | } | |
702 | ||
703 | int libradosstriper::RadosStriperImpl::aio_stat(const std::string& soid, | |
704 | librados::AioCompletionImpl *c, | |
705 | uint64_t *psize, | |
706 | time_t *pmtime) | |
707 | { | |
708 | return aio_generic_stat<time_t>(soid, c, psize, pmtime, &librados::IoCtxImpl::aio_stat); | |
709 | } | |
710 | ||
711 | int libradosstriper::RadosStriperImpl::stat2(const std::string& soid, uint64_t *psize, struct timespec *pts) | |
712 | { | |
713 | // create a completion object | |
714 | librados::AioCompletionImpl c; | |
715 | // call asynchronous version of stat | |
716 | int rc = aio_stat2(soid, &c, psize, pts); | |
717 | if (rc == 0) { | |
718 | // wait for completion of the remove | |
719 | c.wait_for_complete_and_cb(); | |
720 | // get result | |
721 | rc = c.get_return_value(); | |
722 | } | |
723 | return rc; | |
724 | } | |
725 | ||
726 | int libradosstriper::RadosStriperImpl::aio_stat2(const std::string& soid, | |
727 | librados::AioCompletionImpl *c, | |
728 | uint64_t *psize, | |
729 | struct timespec *pts) | |
730 | { | |
731 | return aio_generic_stat<struct timespec>(soid, c, psize, pts, &librados::IoCtxImpl::aio_stat2); | |
732 | } | |
733 | ||
734 | static void rados_req_remove_complete(rados_completion_t c, void *arg) | |
735 | { | |
736 | libradosstriper::RadosStriperImpl::RadosRemoveCompletionData *cdata = | |
737 | reinterpret_cast<libradosstriper::RadosStriperImpl::RadosRemoveCompletionData*>(arg); | |
738 | int rc = rados_aio_get_return_value(c); | |
739 | // in case the object did not exist, it means we had a sparse file, all is fine | |
740 | if (rc == -ENOENT) { | |
741 | rc = 0; | |
742 | } | |
743 | cdata->m_multiAioCompl->complete_request(rc); | |
744 | cdata->put(); | |
745 | } | |
746 | ||
747 | static void rados_req_remove_safe(rados_completion_t c, void *arg) | |
748 | { | |
749 | libradosstriper::RadosStriperImpl::RadosRemoveCompletionData *cdata = | |
750 | reinterpret_cast<libradosstriper::RadosStriperImpl::RadosRemoveCompletionData*>(arg); | |
751 | int rc = rados_aio_get_return_value(c); | |
752 | // in case the object did not exist, it means we had a sparse file, all is fine | |
753 | if (rc == -ENOENT) { | |
754 | rc = 0; | |
755 | } | |
756 | cdata->m_multiAioCompl->safe_request(rc); | |
757 | cdata->put(); | |
758 | } | |
759 | ||
760 | static void striper_remove_aio_req_complete(rados_striper_multi_completion_t c, void *arg) | |
761 | { | |
762 | libradosstriper::RadosStriperImpl::RemoveCompletionData *cdata = | |
763 | reinterpret_cast<libradosstriper::RadosStriperImpl::RemoveCompletionData*>(arg); | |
764 | libradosstriper::MultiAioCompletionImpl *comp = | |
765 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
766 | ldout(cdata->m_striper->cct(), 10) | |
767 | << "RadosStriperImpl : striper_remove_aio_req_complete called for " | |
768 | << cdata->m_soid << dendl; | |
769 | int rc = comp->rval; | |
770 | if (rc == 0) { | |
771 | // All went fine, synchronously remove first object | |
772 | rc = cdata->m_striper->m_ioCtx.remove(cdata->m_striper->getObjectId(cdata->m_soid, 0), | |
773 | cdata->flags); | |
774 | } else { | |
775 | lderr(cdata->m_striper->cct()) | |
776 | << "RadosStriperImpl : deletion/truncation incomplete for " << cdata->m_soid | |
777 | << ", as errors were encountered. The file is left present but it's content " | |
778 | << " has been partially removed" | |
779 | << dendl; | |
780 | } | |
781 | cdata->complete(rc); | |
782 | cdata->put(); | |
783 | } | |
784 | ||
785 | int libradosstriper::RadosStriperImpl::remove(const std::string& soid, int flags) | |
786 | { | |
787 | // create a completion object | |
788 | librados::AioCompletionImpl c; | |
789 | // call asynchronous version of remove | |
790 | int rc = aio_remove(soid, &c, flags); | |
791 | if (rc == 0) { | |
792 | // wait for completion of the remove | |
793 | c.wait_for_complete_and_cb(); | |
794 | // get result | |
795 | rc = c.get_return_value(); | |
796 | } | |
797 | return rc; | |
798 | } | |
799 | ||
800 | int libradosstriper::RadosStriperImpl::aio_remove(const std::string& soid, | |
801 | librados::AioCompletionImpl *c, | |
802 | int flags) | |
803 | { | |
804 | // the RemoveCompletionData object will lock the given soid for the duration | |
805 | // of the removal | |
806 | std::string lockCookie = getUUID(); | |
807 | int rc = m_ioCtx.lock_exclusive(getObjectId(soid, 0), RADOS_LOCK_NAME, lockCookie, "", 0, 0); | |
808 | if (rc) return rc; | |
809 | // create CompletionData for the async remove call | |
810 | RemoveCompletionData *cdata = new RemoveCompletionData(this, soid, lockCookie, c, flags); | |
811 | libradosstriper::MultiAioCompletionImpl *multi_completion = | |
812 | new libradosstriper::MultiAioCompletionImpl; | |
813 | multi_completion->set_complete_callback(cdata, striper_remove_aio_req_complete); | |
814 | // call asynchronous internal version of remove | |
815 | ldout(cct(), 10) | |
816 | << "RadosStriperImpl : Aio_remove starting for " | |
817 | << soid << dendl; | |
818 | rc = internal_aio_remove(soid, multi_completion); | |
819 | multi_completion->put(); | |
820 | return rc; | |
821 | } | |
822 | ||
823 | int libradosstriper::RadosStriperImpl::internal_aio_remove | |
824 | (const std::string& soid, | |
825 | libradosstriper::MultiAioCompletionImpl *multi_completion, | |
826 | int flags) | |
827 | { | |
828 | std::string firstObjOid = getObjectId(soid, 0); | |
829 | try { | |
830 | // check size and get number of rados objects to delete | |
831 | uint64_t nb_objects = 0; | |
832 | bufferlist bl2; | |
833 | int rc = getxattr(soid, XATTR_SIZE, bl2); | |
834 | if (rc < 0) { | |
835 | // no object size (or not able to get it) | |
836 | // try to find the number of object "by hand" | |
837 | uint64_t psize; | |
838 | time_t pmtime; | |
839 | while (!m_ioCtx.stat(getObjectId(soid, nb_objects), &psize, &pmtime)) { | |
840 | nb_objects++; | |
841 | } | |
842 | } else { | |
843 | // count total number of rados objects in the striped object | |
844 | std::string err; | |
845 | // this intermediate string allows to add a null terminator before calling strtol | |
846 | std::string strsize(bl2.c_str(), bl2.length()); | |
847 | uint64_t size = strict_strtoll(strsize.c_str(), 10, &err); | |
848 | if (!err.empty()) { | |
849 | lderr(cct()) << XATTR_SIZE << " : " << err << dendl; | |
850 | ||
851 | return -EINVAL; | |
852 | } | |
853 | uint64_t object_size = m_layout.fl_object_size; | |
854 | uint64_t su = m_layout.fl_stripe_unit; | |
855 | uint64_t stripe_count = m_layout.fl_stripe_count; | |
856 | uint64_t nb_complete_sets = size / (object_size*stripe_count); | |
857 | uint64_t remaining_data = size % (object_size*stripe_count); | |
858 | uint64_t remaining_stripe_units = (remaining_data + su -1) / su; | |
859 | uint64_t remaining_objects = std::min(remaining_stripe_units, stripe_count); | |
860 | nb_objects = nb_complete_sets * stripe_count + remaining_objects; | |
861 | } | |
862 | // delete rados objects in reverse order | |
863 | // Note that we do not drop the first object. This one will only be dropped | |
864 | // if all other removals have been successful, and this is done in the | |
865 | // callback of the multi_completion object | |
866 | int rcr = 0; | |
867 | for (int i = nb_objects-1; i >= 1; i--) { | |
868 | multi_completion->add_request(); | |
869 | RadosRemoveCompletionData *data = | |
870 | new RadosRemoveCompletionData(multi_completion, cct()); | |
871 | librados::AioCompletion *rados_completion = | |
872 | librados::Rados::aio_create_completion(data, | |
873 | rados_req_remove_complete, | |
874 | rados_req_remove_safe); | |
875 | if (flags == 0) { | |
876 | rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion); | |
877 | } else { | |
878 | rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion, flags); | |
879 | } | |
880 | rados_completion->release(); | |
881 | if (rcr < 0 and -ENOENT != rcr) { | |
882 | lderr(cct()) << "RadosStriperImpl::remove : deletion incomplete for " << soid | |
883 | << ", as " << getObjectId(soid, i) << " could not be deleted (rc=" << rc << ")" | |
884 | << dendl; | |
885 | break; | |
886 | } | |
887 | } | |
888 | // we are over adding requests to the multi_completion object | |
889 | multi_completion->finish_adding_requests(); | |
890 | // return | |
891 | return rcr; | |
892 | } catch (ErrorCode &e) { | |
893 | // errror caught when trying to take the exclusive lock | |
894 | return e.m_code; | |
895 | } | |
896 | ||
897 | } | |
898 | ||
899 | int libradosstriper::RadosStriperImpl::trunc(const std::string& soid, uint64_t size) | |
900 | { | |
901 | // lock the object in exclusive mode | |
902 | std::string firstObjOid = getObjectId(soid, 0); | |
903 | librados::ObjectWriteOperation op; | |
904 | op.assert_exists(); | |
905 | std::string lockCookie = RadosStriperImpl::getUUID(); | |
906 | utime_t dur = utime_t(); | |
907 | rados::cls::lock::lock(&op, RADOS_LOCK_NAME, LOCK_EXCLUSIVE, lockCookie, "", "", dur, 0); | |
908 | int rc = m_ioCtx.operate(firstObjOid, &op); | |
909 | if (rc) return rc; | |
910 | // load layout and size | |
911 | ceph_file_layout layout; | |
912 | uint64_t original_size; | |
913 | rc = internal_get_layout_and_size(firstObjOid, &layout, &original_size); | |
914 | if (!rc) { | |
915 | if (size < original_size) { | |
916 | rc = truncate(soid, original_size, size, layout); | |
917 | } else if (size > original_size) { | |
918 | rc = grow(soid, original_size, size, layout); | |
919 | } | |
920 | } | |
921 | // unlock object, ignore return code as we cannot do much | |
922 | m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie); | |
923 | // final return | |
924 | return rc; | |
925 | } | |
926 | ||
927 | ||
928 | ///////////////////////// private helpers ///////////////////////////// | |
929 | ||
930 | std::string libradosstriper::RadosStriperImpl::getObjectId(const object_t& soid, | |
931 | long long unsigned objectno) | |
932 | { | |
933 | std::ostringstream s; | |
934 | s << soid << '.' << std::setfill ('0') << std::setw(16) << std::hex << objectno; | |
935 | return s.str(); | |
936 | } | |
937 | ||
938 | void libradosstriper::RadosStriperImpl::unlockObject(const std::string& soid, | |
939 | const std::string& lockCookie) | |
940 | { | |
941 | // unlock the shared lock on the first rados object | |
942 | std::string firstObjOid = getObjectId(soid, 0); | |
943 | m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie); | |
944 | } | |
945 | ||
946 | void libradosstriper::RadosStriperImpl::aio_unlockObject(const std::string& soid, | |
947 | const std::string& lockCookie, | |
948 | librados::AioCompletion *c) | |
949 | { | |
950 | // unlock the shared lock on the first rados object | |
951 | std::string firstObjOid = getObjectId(soid, 0); | |
952 | m_ioCtx.aio_unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie, c); | |
953 | } | |
954 | ||
955 | static void rados_write_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg) | |
956 | { | |
957 | libradosstriper::RadosStriperImpl::WriteCompletionData *cdata = | |
958 | reinterpret_cast<libradosstriper::RadosStriperImpl::WriteCompletionData*>(arg); | |
959 | libradosstriper::MultiAioCompletionImpl *comp = | |
960 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
961 | cdata->complete_unlock(comp->rval); | |
962 | cdata->put(); | |
963 | } | |
964 | ||
965 | static void striper_write_aio_req_complete(rados_striper_multi_completion_t c, void *arg) | |
966 | { | |
967 | libradosstriper::RadosStriperImpl::WriteCompletionData *cdata = | |
968 | reinterpret_cast<libradosstriper::RadosStriperImpl::WriteCompletionData*>(arg); | |
969 | // launch the async unlocking of the object | |
970 | cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion); | |
971 | // complete the write part in parallel | |
972 | libradosstriper::MultiAioCompletionImpl *comp = | |
973 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
974 | cdata->complete_write(comp->rval); | |
975 | cdata->put(); | |
976 | } | |
977 | ||
978 | static void striper_write_aio_req_safe(rados_striper_multi_completion_t c, void *arg) | |
979 | { | |
980 | libradosstriper::RadosStriperImpl::WriteCompletionData *cdata = | |
981 | reinterpret_cast<libradosstriper::RadosStriperImpl::WriteCompletionData*>(arg); | |
982 | libradosstriper::MultiAioCompletionImpl *comp = | |
983 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
984 | cdata->safe(comp->rval); | |
985 | cdata->put(); | |
986 | } | |
987 | ||
988 | int libradosstriper::RadosStriperImpl::write_in_open_object(const std::string& soid, | |
989 | const ceph_file_layout& layout, | |
990 | const std::string& lockCookie, | |
991 | const bufferlist& bl, | |
992 | size_t len, | |
993 | uint64_t off) { | |
994 | // create a completion object to be passed to the callbacks of the multicompletion | |
995 | // we need 3 references as striper_write_aio_req_complete will release two and | |
996 | // striper_write_aio_req_safe will release one | |
997 | WriteCompletionData *cdata = new WriteCompletionData(this, soid, lockCookie, 0, 3); | |
998 | cdata->get(); // local ref | |
999 | // create a completion object for the unlocking of the striped object at the end of the write | |
1000 | librados::AioCompletion *unlock_completion = | |
1001 | librados::Rados::aio_create_completion(cdata, rados_write_aio_unlock_complete, 0); | |
1002 | cdata->m_unlockCompletion = unlock_completion; | |
1003 | // create the multicompletion that will handle the write completion | |
1004 | libradosstriper::MultiAioCompletionImpl *c = new libradosstriper::MultiAioCompletionImpl; | |
1005 | c->set_complete_callback(cdata, striper_write_aio_req_complete); | |
1006 | c->set_safe_callback(cdata, striper_write_aio_req_safe); | |
1007 | // call the asynchronous API | |
1008 | int rc = internal_aio_write(soid, c, bl, len, off, layout); | |
1009 | if (!rc) { | |
1010 | // wait for completion and safety of data | |
1011 | c->wait_for_complete_and_cb(); | |
1012 | c->wait_for_safe_and_cb(); | |
1013 | // wait for the unlocking | |
1014 | unlock_completion->wait_for_complete(); | |
1015 | // return result | |
1016 | rc = c->get_return_value(); | |
1017 | } | |
1018 | c->put(); | |
1019 | cdata->put(); | |
1020 | return rc; | |
1021 | } | |
1022 | ||
1023 | int libradosstriper::RadosStriperImpl::aio_write_in_open_object(const std::string& soid, | |
1024 | librados::AioCompletionImpl *c, | |
1025 | const ceph_file_layout& layout, | |
1026 | const std::string& lockCookie, | |
1027 | const bufferlist& bl, | |
1028 | size_t len, | |
1029 | uint64_t off) { | |
1030 | // create a completion object to be passed to the callbacks of the multicompletion | |
1031 | // we need 3 references as striper_write_aio_req_complete will release two and | |
1032 | // striper_write_aio_req_safe will release one | |
1033 | WriteCompletionData *cdata = new WriteCompletionData(this, soid, lockCookie, c, 3); | |
1034 | cdata->get(); // local ref | |
1035 | m_ioCtxImpl->get(); | |
1036 | c->io = m_ioCtxImpl; | |
1037 | // create a completion object for the unlocking of the striped object at the end of the write | |
1038 | librados::AioCompletion *unlock_completion = | |
1039 | librados::Rados::aio_create_completion(cdata, rados_write_aio_unlock_complete, 0); | |
1040 | cdata->m_unlockCompletion = unlock_completion; | |
1041 | // create the multicompletion that will handle the write completion | |
1042 | libradosstriper::MultiAioCompletionImpl *nc = new libradosstriper::MultiAioCompletionImpl; | |
1043 | nc->set_complete_callback(cdata, striper_write_aio_req_complete); | |
1044 | nc->set_safe_callback(cdata, striper_write_aio_req_safe); | |
1045 | // internal asynchronous API | |
1046 | int rc = internal_aio_write(soid, nc, bl, len, off, layout); | |
1047 | nc->put(); | |
1048 | cdata->put(); | |
1049 | return rc; | |
1050 | } | |
1051 | ||
1052 | static void rados_req_write_safe(rados_completion_t c, void *arg) | |
1053 | { | |
1054 | libradosstriper::MultiAioCompletionImpl *comp = | |
1055 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(arg); | |
1056 | comp->safe_request(rados_aio_get_return_value(c)); | |
1057 | } | |
1058 | ||
1059 | static void rados_req_write_complete(rados_completion_t c, void *arg) | |
1060 | { | |
1061 | libradosstriper::MultiAioCompletionImpl *comp = | |
1062 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(arg); | |
1063 | comp->complete_request(rados_aio_get_return_value(c)); | |
1064 | } | |
1065 | ||
1066 | int | |
1067 | libradosstriper::RadosStriperImpl::internal_aio_write(const std::string& soid, | |
1068 | libradosstriper::MultiAioCompletionImpl *c, | |
1069 | const bufferlist& bl, | |
1070 | size_t len, | |
1071 | uint64_t off, | |
1072 | const ceph_file_layout& layout) | |
1073 | { | |
1074 | int r = 0; | |
1075 | // Do not try anything if we are called with empty buffer, | |
1076 | // file_to_extents would raise an exception | |
1077 | if (len > 0) { | |
1078 | // get list of extents to be written to | |
1079 | vector<ObjectExtent> extents; | |
1080 | std::string format = soid + RADOS_OBJECT_EXTENSION_FORMAT; | |
1081 | file_layout_t l; | |
1082 | l.from_legacy(layout); | |
1083 | Striper::file_to_extents(cct(), format.c_str(), &l, off, len, 0, extents); | |
1084 | // go through the extents | |
1085 | for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) { | |
1086 | // assemble pieces of a given object into a single buffer list | |
1087 | bufferlist oid_bl; | |
1088 | for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin(); | |
1089 | q != p->buffer_extents.end(); | |
1090 | ++q) { | |
1091 | bufferlist buffer_bl; | |
1092 | buffer_bl.substr_of(bl, q->first, q->second); | |
1093 | oid_bl.append(buffer_bl); | |
1094 | } | |
1095 | // and write the object | |
1096 | c->add_request(); | |
1097 | librados::AioCompletion *rados_completion = | |
1098 | librados::Rados::aio_create_completion(c, rados_req_write_complete, rados_req_write_safe); | |
1099 | r = m_ioCtx.aio_write(p->oid.name, rados_completion, oid_bl, p->length, p->offset); | |
1100 | rados_completion->release(); | |
1101 | if (r < 0) | |
1102 | break; | |
1103 | } | |
1104 | } | |
1105 | c->finish_adding_requests(); | |
1106 | return r; | |
1107 | } | |
1108 | ||
1109 | int libradosstriper::RadosStriperImpl::extract_uint32_attr | |
1110 | (std::map<std::string, bufferlist> &attrs, | |
1111 | const std::string& key, | |
1112 | ceph_le32 *value) | |
1113 | { | |
1114 | std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key); | |
1115 | if (attrsIt != attrs.end()) { | |
1116 | // this intermediate string allows to add a null terminator before calling strtol | |
1117 | std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length()); | |
1118 | std::string err; | |
1119 | *value = strict_strtol(strvalue.c_str(), 10, &err); | |
1120 | if (!err.empty()) { | |
1121 | lderr(cct()) << key << " : " << err << dendl; | |
1122 | return -EINVAL; | |
1123 | } | |
1124 | } else { | |
1125 | return -ENOENT; | |
1126 | } | |
1127 | return 0; | |
1128 | } | |
1129 | ||
1130 | int libradosstriper::RadosStriperImpl::extract_sizet_attr | |
1131 | (std::map<std::string, bufferlist> &attrs, | |
1132 | const std::string& key, | |
1133 | size_t *value) | |
1134 | { | |
1135 | std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key); | |
1136 | if (attrsIt != attrs.end()) { | |
1137 | // this intermediate string allows to add a null terminator before calling strtol | |
1138 | std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length()); | |
1139 | std::string err; | |
1140 | *value = strict_strtoll(strvalue.c_str(), 10, &err); | |
1141 | if (!err.empty()) { | |
1142 | lderr(cct()) << key << " : " << err << dendl; | |
1143 | return -EINVAL; | |
1144 | } | |
1145 | } else { | |
1146 | return -ENOENT; | |
1147 | } | |
1148 | return 0; | |
1149 | } | |
1150 | ||
1151 | int libradosstriper::RadosStriperImpl::internal_get_layout_and_size( | |
1152 | const std::string& oid, | |
1153 | ceph_file_layout *layout, | |
1154 | uint64_t *size) | |
1155 | { | |
1156 | // get external attributes of the first rados object | |
1157 | std::map<std::string, bufferlist> attrs; | |
1158 | int rc = m_ioCtx.getxattrs(oid, attrs); | |
1159 | if (rc) return rc; | |
1160 | // deal with stripe_unit | |
1161 | rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_UNIT, &layout->fl_stripe_unit); | |
1162 | if (rc) return rc; | |
1163 | // deal with stripe_count | |
1164 | rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_COUNT, &layout->fl_stripe_count); | |
1165 | if (rc) return rc; | |
1166 | // deal with object_size | |
1167 | rc = extract_uint32_attr(attrs, XATTR_LAYOUT_OBJECT_SIZE, &layout->fl_object_size); | |
1168 | if (rc) return rc; | |
1169 | // deal with size | |
1170 | size_t ssize; | |
1171 | rc = extract_sizet_attr(attrs, XATTR_SIZE, &ssize); | |
1172 | if (rc) { | |
1173 | return rc; | |
1174 | } | |
1175 | *size = ssize; | |
1176 | // make valgrind happy by setting unused fl_pg_pool | |
1177 | layout->fl_pg_pool = 0; | |
1178 | return 0; | |
1179 | } | |
1180 | ||
1181 | int libradosstriper::RadosStriperImpl::openStripedObjectForRead( | |
1182 | const std::string& soid, | |
1183 | ceph_file_layout *layout, | |
1184 | uint64_t *size, | |
1185 | std::string *lockCookie) | |
1186 | { | |
1187 | // take a lock the first rados object, if it exists and gets its size | |
1188 | // check, lock and size reading must be atomic and are thus done within a single operation | |
1189 | librados::ObjectWriteOperation op; | |
1190 | op.assert_exists(); | |
1191 | *lockCookie = getUUID(); | |
1192 | utime_t dur = utime_t(); | |
1193 | rados::cls::lock::lock(&op, RADOS_LOCK_NAME, LOCK_SHARED, *lockCookie, "Tag", "", dur, 0); | |
1194 | std::string firstObjOid = getObjectId(soid, 0); | |
1195 | int rc = m_ioCtx.operate(firstObjOid, &op); | |
1196 | if (rc) { | |
1197 | // error case (including -ENOENT) | |
1198 | return rc; | |
1199 | } | |
1200 | rc = internal_get_layout_and_size(firstObjOid, layout, size); | |
1201 | if (rc) { | |
1202 | unlockObject(soid, *lockCookie); | |
1203 | lderr(cct()) << "RadosStriperImpl::openStripedObjectForRead : " | |
1204 | << "could not load layout and size for " | |
1205 | << soid << " : rc = " << rc << dendl; | |
1206 | } | |
1207 | return rc; | |
1208 | } | |
1209 | ||
1210 | int libradosstriper::RadosStriperImpl::openStripedObjectForWrite(const std::string& soid, | |
1211 | ceph_file_layout *layout, | |
1212 | uint64_t *size, | |
1213 | std::string *lockCookie, | |
1214 | bool isFileSizeAbsolute) | |
1215 | { | |
1216 | // take a lock the first rados object, if it exists | |
1217 | // check and lock must be atomic and are thus done within a single operation | |
1218 | librados::ObjectWriteOperation op; | |
1219 | op.assert_exists(); | |
1220 | *lockCookie = getUUID(); | |
1221 | utime_t dur = utime_t(); | |
1222 | rados::cls::lock::lock(&op, RADOS_LOCK_NAME, LOCK_SHARED, *lockCookie, "Tag", "", dur, 0); | |
1223 | std::string firstObjOid = getObjectId(soid, 0); | |
1224 | int rc = m_ioCtx.operate(firstObjOid, &op); | |
1225 | if (rc) { | |
1226 | if (rc == -ENOENT) { | |
1227 | // object does not exist, delegate to createEmptyStripedObject | |
1228 | int rc = createAndOpenStripedObject(soid, layout, *size, lockCookie, isFileSizeAbsolute); | |
1229 | // return original size | |
1230 | *size = 0; | |
1231 | return rc; | |
1232 | } else { | |
1233 | return rc; | |
1234 | } | |
1235 | } | |
1236 | // all fine | |
1237 | uint64_t curSize; | |
1238 | rc = internal_get_layout_and_size(firstObjOid, layout, &curSize); | |
1239 | if (rc) { | |
1240 | unlockObject(soid, *lockCookie); | |
1241 | lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : " | |
1242 | << "could not load layout and size for " | |
1243 | << soid << " : rc = " << rc << dendl; | |
1244 | return rc; | |
1245 | } | |
1246 | // atomically update object size, only if smaller than current one | |
1247 | if (!isFileSizeAbsolute) | |
1248 | *size += curSize; | |
1249 | librados::ObjectWriteOperation writeOp; | |
1250 | writeOp.cmpxattr(XATTR_SIZE, LIBRADOS_CMPXATTR_OP_GT, *size); | |
1251 | std::ostringstream oss; | |
1252 | oss << *size; | |
1253 | bufferlist bl; | |
1254 | bl.append(oss.str()); | |
1255 | writeOp.setxattr(XATTR_SIZE, bl); | |
1256 | rc = m_ioCtx.operate(firstObjOid, &writeOp); | |
1257 | // return current size | |
1258 | *size = curSize; | |
1259 | // handle case where objectsize is already bigger than size | |
1260 | if (-ECANCELED == rc) | |
1261 | rc = 0; | |
1262 | if (rc) { | |
1263 | unlockObject(soid, *lockCookie); | |
1264 | lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : " | |
1265 | << "could not set new size for " | |
1266 | << soid << " : rc = " << rc << dendl; | |
1267 | } | |
1268 | return rc; | |
1269 | } | |
1270 | ||
1271 | int libradosstriper::RadosStriperImpl::createAndOpenStripedObject(const std::string& soid, | |
1272 | ceph_file_layout *layout, | |
1273 | uint64_t size, | |
1274 | std::string *lockCookie, | |
1275 | bool isFileSizeAbsolute) | |
1276 | { | |
1277 | // build atomic write operation | |
1278 | librados::ObjectWriteOperation writeOp; | |
1279 | writeOp.create(true); | |
1280 | // object_size | |
1281 | std::ostringstream oss_object_size; | |
1282 | oss_object_size << m_layout.fl_object_size; | |
1283 | bufferlist bl_object_size; | |
1284 | bl_object_size.append(oss_object_size.str()); | |
1285 | writeOp.setxattr(XATTR_LAYOUT_OBJECT_SIZE, bl_object_size); | |
1286 | // stripe unit | |
1287 | std::ostringstream oss_stripe_unit; | |
1288 | oss_stripe_unit << m_layout.fl_stripe_unit; | |
1289 | bufferlist bl_stripe_unit; | |
1290 | bl_stripe_unit.append(oss_stripe_unit.str()); | |
1291 | writeOp.setxattr(XATTR_LAYOUT_STRIPE_UNIT, bl_stripe_unit); | |
1292 | // stripe count | |
1293 | std::ostringstream oss_stripe_count; | |
1294 | oss_stripe_count << m_layout.fl_stripe_count; | |
1295 | bufferlist bl_stripe_count; | |
1296 | bl_stripe_count.append(oss_stripe_count.str()); | |
1297 | writeOp.setxattr(XATTR_LAYOUT_STRIPE_COUNT, bl_stripe_count); | |
1298 | // size | |
1299 | std::ostringstream oss_size; | |
1300 | oss_size << (isFileSizeAbsolute?size:0); | |
1301 | bufferlist bl_size; | |
1302 | bl_size.append(oss_size.str()); | |
1303 | writeOp.setxattr(XATTR_SIZE, bl_size); | |
1304 | // effectively change attributes | |
1305 | std::string firstObjOid = getObjectId(soid, 0); | |
1306 | int rc = m_ioCtx.operate(firstObjOid, &writeOp); | |
1307 | // in case of error (but no EEXIST which would mean the object existed), return | |
1308 | if (rc && -EEXIST != rc) return rc; | |
1309 | // Otherwise open the object | |
1310 | uint64_t fileSize = size; | |
1311 | return openStripedObjectForWrite(soid, layout, &fileSize, lockCookie, isFileSizeAbsolute); | |
1312 | } | |
1313 | ||
1314 | static void striper_truncate_aio_req_complete(rados_striper_multi_completion_t c, void *arg) | |
1315 | { | |
1316 | libradosstriper::RadosStriperImpl::TruncateCompletionData *cdata = | |
1317 | reinterpret_cast<libradosstriper::RadosStriperImpl::TruncateCompletionData*>(arg); | |
1318 | libradosstriper::MultiAioCompletionImpl *comp = | |
1319 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
1320 | if (0 == comp->rval) { | |
1321 | // all went fine, change size in the external attributes | |
1322 | std::ostringstream oss; | |
1323 | oss << cdata->m_size; | |
1324 | bufferlist bl; | |
1325 | bl.append(oss.str()); | |
1326 | cdata->m_striper->setxattr(cdata->m_soid, XATTR_SIZE, bl); | |
1327 | } | |
1328 | cdata->put(); | |
1329 | } | |
1330 | ||
1331 | int libradosstriper::RadosStriperImpl::truncate(const std::string& soid, | |
1332 | uint64_t original_size, | |
1333 | uint64_t size, | |
1334 | ceph_file_layout &layout) | |
1335 | { | |
1336 | TruncateCompletionData *cdata = new TruncateCompletionData(this, soid, size); | |
1337 | libradosstriper::MultiAioCompletionImpl *multi_completion = | |
1338 | new libradosstriper::MultiAioCompletionImpl; | |
1339 | multi_completion->set_complete_callback(cdata, striper_truncate_aio_req_complete); | |
1340 | // call asynchrous version of truncate | |
1341 | int rc = aio_truncate(soid, multi_completion, original_size, size, layout); | |
1342 | // wait for completion of the truncation | |
1343 | multi_completion->finish_adding_requests(); | |
1344 | multi_completion->wait_for_complete_and_cb(); | |
1345 | // return result | |
1346 | if (rc == 0) { | |
1347 | rc = multi_completion->get_return_value(); | |
1348 | } | |
1349 | multi_completion->put(); | |
1350 | return rc; | |
1351 | } | |
1352 | ||
1353 | int libradosstriper::RadosStriperImpl::aio_truncate | |
1354 | (const std::string& soid, | |
1355 | libradosstriper::MultiAioCompletionImpl *multi_completion, | |
1356 | uint64_t original_size, | |
1357 | uint64_t size, | |
1358 | ceph_file_layout &layout) | |
1359 | { | |
1360 | // handle the underlying rados objects. 3 cases here : | |
1361 | // -- the objects belonging to object sets entirely located | |
1362 | // before the truncation are unchanged | |
1363 | // -- the objects belonging to the object set where the | |
1364 | // truncation took place are truncated or removed | |
1365 | // -- the objects belonging to object sets entirely located | |
1366 | // after the truncation are removed | |
1367 | // Note that we do it backward and that we change the size in | |
1368 | // the external attributes only at the end. This make sure that | |
1369 | // no rados object stays behind if we remove the striped object | |
1370 | // after a truncation has failed | |
1371 | uint64_t trunc_objectsetno = size / layout.fl_object_size / layout.fl_stripe_count; | |
1372 | uint64_t last_objectsetno = original_size / layout.fl_object_size / layout.fl_stripe_count; | |
1373 | bool exists = false; | |
1374 | for (int64_t objectno = (last_objectsetno+1) * layout.fl_stripe_count-1; | |
1375 | objectno >= (int64_t)((trunc_objectsetno + 1) * layout.fl_stripe_count); | |
1376 | objectno--) { | |
1377 | // if no object existed so far, check object existence | |
1378 | if (!exists) { | |
1379 | uint64_t nb_full_object_set = objectno / layout.fl_stripe_count; | |
1380 | uint64_t object_index_in_set = objectno % layout.fl_stripe_count; | |
1381 | uint64_t set_start_off = nb_full_object_set * layout.fl_object_size * layout.fl_stripe_count; | |
1382 | uint64_t object_start_off = set_start_off + object_index_in_set * layout.fl_stripe_unit; | |
1383 | exists = (original_size > object_start_off); | |
1384 | } | |
1385 | if (exists) { | |
1386 | // remove asynchronously | |
1387 | multi_completion->add_request(); | |
1388 | RadosRemoveCompletionData *data = | |
1389 | new RadosRemoveCompletionData(multi_completion, cct()); | |
1390 | librados::AioCompletion *rados_completion = | |
1391 | librados::Rados::aio_create_completion(data, | |
1392 | rados_req_remove_complete, | |
1393 | rados_req_remove_safe); | |
1394 | int rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion); | |
1395 | rados_completion->release(); | |
1396 | // in case the object did not exist, it means we had a sparse file, all is fine | |
1397 | if (rc && rc != -ENOENT) return rc; | |
1398 | } | |
1399 | } | |
1400 | for (int64_t objectno = ((trunc_objectsetno + 1) * layout.fl_stripe_count) -1; | |
1401 | objectno >= (int64_t)(trunc_objectsetno * layout.fl_stripe_count); | |
1402 | objectno--) { | |
1403 | // if no object existed so far, check object existence | |
1404 | if (!exists) { | |
1405 | uint64_t object_start_off = ((objectno / layout.fl_stripe_count) * layout.fl_object_size) + | |
1406 | ((objectno % layout.fl_stripe_count) * layout.fl_stripe_unit); | |
1407 | exists = (original_size > object_start_off); | |
1408 | } | |
1409 | if (exists) { | |
1410 | // truncate | |
1411 | file_layout_t l; | |
1412 | l.from_legacy(layout); | |
1413 | uint64_t new_object_size = Striper::object_truncate_size(cct(), &l, objectno, size); | |
1414 | int rc; | |
1415 | if (new_object_size > 0 or 0 == objectno) { | |
1416 | // trunc is synchronous as there is no async version | |
1417 | // but note that only a single object will be truncated | |
1418 | // reducing the overload to a fixed amount | |
1419 | rc = m_ioCtx.trunc(getObjectId(soid, objectno), new_object_size); | |
1420 | } else { | |
1421 | // removes are asynchronous in order to speed up truncations of big files | |
1422 | multi_completion->add_request(); | |
1423 | RadosRemoveCompletionData *data = | |
1424 | new RadosRemoveCompletionData(multi_completion, cct()); | |
1425 | librados::AioCompletion *rados_completion = | |
1426 | librados::Rados::aio_create_completion(data, | |
1427 | rados_req_remove_complete, | |
1428 | rados_req_remove_safe); | |
1429 | rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion); | |
1430 | rados_completion->release(); | |
1431 | } | |
1432 | // in case the object did not exist, it means we had a sparse file, all is fine | |
1433 | if (rc && rc != -ENOENT) return rc; | |
1434 | } | |
1435 | } | |
1436 | return 0; | |
1437 | } | |
1438 | ||
1439 | int libradosstriper::RadosStriperImpl::grow(const std::string& soid, | |
1440 | uint64_t original_size, | |
1441 | uint64_t size, | |
1442 | ceph_file_layout &layout) | |
1443 | { | |
1444 | // handle the underlying rados objects. As we support sparse objects, | |
1445 | // we only have to change the size in the external attributes | |
1446 | std::ostringstream oss; | |
1447 | oss << size; | |
1448 | bufferlist bl; | |
1449 | bl.append(oss.str()); | |
1450 | int rc = m_ioCtx.setxattr(getObjectId(soid, 0), XATTR_SIZE, bl); | |
1451 | return rc; | |
1452 | } | |
1453 | ||
1454 | std::string libradosstriper::RadosStriperImpl::getUUID() | |
1455 | { | |
1456 | struct uuid_d uuid; | |
1457 | uuid.generate_random(); | |
1458 | char suuid[37]; | |
1459 | uuid.print(suuid); | |
1460 | return std::string(suuid); | |
1461 | } |