]> git.proxmox.com Git - ceph.git/blob - ceph/src/libradosstriper/RadosStriperImpl.cc
fd5f13f0065fd52df71839c407401a2042b05863
[ceph.git] / ceph / src / libradosstriper / RadosStriperImpl.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "libradosstriper/RadosStriperImpl.h"
16
17 #include <errno.h>
18
19 #include <sstream>
20 #include <iomanip>
21 #include <algorithm>
22
23 #include "include/types.h"
24 #include "include/uuid.h"
25 #include "include/ceph_fs.h"
26 #include "common/dout.h"
27 #include "common/strtol.h"
28 #include "osdc/Striper.h"
29 #include "libradosstriper/MultiAioCompletionImpl.h"
30 #include "librados/AioCompletionImpl.h"
31 #include <cls/lock/cls_lock_client.h>
32
33 /*
34 * This file contains the actual implementation of the rados striped objects interface.
35 *
36 * Striped objects are stored in rados in a set of regular rados objects, after their
37 * content has been striped using the osdc/Striper interface.
38 *
39 * The external attributes of the striped object are mapped to the attributes of the
40 * first underlying object. This first object has a set of extra external attributes
41 * storing the layout of the striped object for future read back. These attributes are :
42 * - striper.layout.object_size : the size of rados objects used.
43 * Must be a multiple of striper.layout.stripe_unit
44 * - striper.layout.stripe_unit : the size of a stripe unit
45 * - striper.layout.stripe_count : the number of stripes used
46 * - striper.size : total striped object size
47 *
48 * In general operations on striped objects are not atomic.
49 * However, a certain number of safety guards have been put to make the interface closer
50 * to atomicity :
51 * - each data operation takes a shared lock on the first rados object for the
52 * whole time of the operation
53 * - the remove and trunc operations take an exclusive lock on the first rados object
54 * for the whole time of the operation
55 * This makes sure that no removal/truncation of a striped object occurs while
56 * data operations are happening and vice versa. It thus makes sure that the layout
57 * of a striped object does not change during data operation, which is essential for
58 * data consistency.
59 *
60 * Still the writing to a striped object is not atomic. This means in particular that
61 * the size of an object may not be in sync with its content at all times.
62 * As the size is always guaranteed to be updated first and in an atomic way, and as
63 * sparse striped objects are supported (see below), what will typically happen is
64 * that a reader that comes too soon after a write will read 0s instead of the actual
65 * data.
66 *
67 * Note that remove handles the pieces of the striped object in reverse order,
68 * so that the head object is removed last, making the completion of the deletion atomic.
69 *
70 * Striped objects can be sparse, typically in case data was written at the end of the
71 * striped object only. In such a case, some rados objects constituting the striped object
72 * may be missing. Others can be partial (only the beginning will have data).
73 * When dealing with such sparse striped files, missing objects are detected and
74 * considered as full of 0s. They are however not created until real data is written
75 * to them.
76 *
77 * There are a number of missing features/improvements that could be implemented.
78 * Here are some ideas :
79 * - implementation of missing entry points (compared to rados)
80 * In particular : clone_range, sparse_read, exec, aio_flush_async, tmaps, omaps, ...
81 *
82 */
83
84 #define dout_subsys ceph_subsys_rados
85 #undef dout_prefix
86 #define dout_prefix *_dout << "libradosstriper: "
87
88 /// size of xattr buffer
89 #define XATTR_BUFFER_SIZE 32
90
91 /// names of the different xattr entries
92 #define XATTR_LAYOUT_STRIPE_UNIT "striper.layout.stripe_unit"
93 #define XATTR_LAYOUT_STRIPE_COUNT "striper.layout.stripe_count"
94 #define XATTR_LAYOUT_OBJECT_SIZE "striper.layout.object_size"
95 #define XATTR_SIZE "striper.size"
96 #define LOCK_PREFIX "lock."
97
98 /// name of the lock used on objects to ensure layout stability during IO
99 #define RADOS_LOCK_NAME "striper.lock"
100
101 /// format of the extension of rados objects created for a given striped object
102 #define RADOS_OBJECT_EXTENSION_FORMAT ".%016llx"
103
104 /// default object layout
105 struct ceph_file_layout default_file_layout = {
106 init_le32(1<<22), // fl_stripe_unit
107 init_le32(1), // fl_stripe_count
108 init_le32(1<<22), // fl_object_size
109 init_le32(0), // fl_cas_hash
110 init_le32(0), // fl_object_stripe_unit
111 init_le32(-1), // fl_unused
112 init_le32(-1), // fl_pg_pool
113 };
114
115
116 ///////////////////////// CompletionData /////////////////////////////
117
118 libradosstriper::RadosStriperImpl::CompletionData::CompletionData
119 (libradosstriper::RadosStriperImpl* striper,
120 const std::string& soid,
121 const std::string& lockCookie,
122 librados::AioCompletionImpl *userCompletion,
123 int n) :
124 RefCountedObject(striper->cct(), n),
125 m_striper(striper), m_soid(soid), m_lockCookie(lockCookie), m_ack(0) {
126 m_striper->get();
127 if (userCompletion) {
128 m_ack = new librados::IoCtxImpl::C_aio_Complete(userCompletion);
129 userCompletion->io = striper->m_ioCtxImpl;
130 }
131 }
132
133 libradosstriper::RadosStriperImpl::CompletionData::~CompletionData() {
134 if (m_ack) delete m_ack;
135 m_striper->put();
136 }
137
138 void libradosstriper::RadosStriperImpl::CompletionData::complete(int r) {
139 if (m_ack) m_ack->finish(r);
140 }
141
142 libradosstriper::RadosStriperImpl::ReadCompletionData::ReadCompletionData
143 (libradosstriper::RadosStriperImpl* striper,
144 const std::string& soid,
145 const std::string& lockCookie,
146 librados::AioCompletionImpl *userCompletion,
147 bufferlist* bl,
148 std::vector<ObjectExtent>* extents,
149 std::vector<bufferlist>* resultbl,
150 int n) :
151 CompletionData(striper, soid, lockCookie, userCompletion, n),
152 m_bl(bl), m_extents(extents), m_resultbl(resultbl), m_readRc(0),
153 m_unlockCompletion(0) {}
154
155 libradosstriper::RadosStriperImpl::ReadCompletionData::~ReadCompletionData() {
156 m_unlockCompletion->release();
157 delete m_extents;
158 delete m_resultbl;
159 }
160
161 void libradosstriper::RadosStriperImpl::ReadCompletionData::complete_read(int r) {
162 // gather data into final buffer
163 Striper::StripedReadResult readResult;
164 vector<bufferlist>::iterator bit = m_resultbl->begin();
165 for (vector<ObjectExtent>::iterator eit = m_extents->begin();
166 eit != m_extents->end();
167 ++eit, ++bit) {
168 readResult.add_partial_result(m_striper->cct(), *bit, eit->buffer_extents);
169 }
170 m_bl->clear();
171 readResult.assemble_result(m_striper->cct(), *m_bl, true);
172 // Remember return code
173 m_readRc = r;
174 }
175
176 void libradosstriper::RadosStriperImpl::ReadCompletionData::complete_unlock(int r) {
177 // call parent's completion method
178 // Note that we ignore the return code of the unlock as we cannot do much about it
179 CompletionData::complete(m_readRc?m_readRc:m_bl->length());
180 }
181
182 libradosstriper::RadosStriperImpl::WriteCompletionData::WriteCompletionData
183 (libradosstriper::RadosStriperImpl* striper,
184 const std::string& soid,
185 const std::string& lockCookie,
186 librados::AioCompletionImpl *userCompletion,
187 int n) :
188 CompletionData(striper, soid, lockCookie, userCompletion, n), m_safe(0),
189 m_unlockCompletion(0) {
190 if (userCompletion) {
191 m_safe = new librados::IoCtxImpl::C_aio_Complete(userCompletion);
192 }
193 }
194
195 libradosstriper::RadosStriperImpl::WriteCompletionData::~WriteCompletionData() {
196 m_unlockCompletion->release();
197 if (m_safe) delete m_safe;
198 }
199
200 void libradosstriper::RadosStriperImpl::WriteCompletionData::complete_unlock(int r) {
201 // call parent's completion method
202 // Note that we ignore the return code of the unlock as we cannot do much about it
203 CompletionData::complete(m_writeRc);
204 }
205
206 void libradosstriper::RadosStriperImpl::WriteCompletionData::complete_write(int r) {
207 // Remember return code
208 m_writeRc = r;
209 }
210
211 void libradosstriper::RadosStriperImpl::WriteCompletionData::safe(int r) {
212 if (m_safe) m_safe->finish(r);
213 }
214
215 libradosstriper::RadosStriperImpl::RemoveCompletionData::RemoveCompletionData
216 (libradosstriper::RadosStriperImpl* striper,
217 const std::string& soid,
218 const std::string& lockCookie,
219 librados::AioCompletionImpl *userCompletion,
220 int flags) :
221 CompletionData(striper, soid, lockCookie, userCompletion), flags(flags) {}
222
223 libradosstriper::RadosStriperImpl::TruncateCompletionData::TruncateCompletionData
224 (libradosstriper::RadosStriperImpl* striper,
225 const std::string& soid,
226 uint64_t size) :
227 RefCountedObject(striper->cct()),
228 m_striper(striper), m_soid(soid), m_size(size) {
229 m_striper->get();
230 }
231
232 libradosstriper::RadosStriperImpl::TruncateCompletionData::~TruncateCompletionData() {
233 m_striper->put();
234 }
235
236 ///////////////////////// constructor /////////////////////////////
237
238 libradosstriper::RadosStriperImpl::RadosStriperImpl(librados::IoCtx& ioctx, librados::IoCtxImpl *ioctx_impl) :
239 m_refCnt(0),lock("RadosStriper Refcont", false, false), m_radosCluster(ioctx), m_ioCtx(ioctx), m_ioCtxImpl(ioctx_impl),
240 m_layout(default_file_layout) {}
241
242 ///////////////////////// layout /////////////////////////////
243
244 int libradosstriper::RadosStriperImpl::setObjectLayoutStripeUnit
245 (unsigned int stripe_unit)
246 {
247 /* stripe unit must be non-zero, 64k increment */
248 if (!stripe_unit || (stripe_unit & (CEPH_MIN_STRIPE_UNIT-1)))
249 return -EINVAL;
250 m_layout.fl_stripe_unit = stripe_unit;
251 return 0;
252 }
253
254 int libradosstriper::RadosStriperImpl::setObjectLayoutStripeCount
255 (unsigned int stripe_count)
256 {
257 /* stripe count must be non-zero */
258 if (!stripe_count)
259 return -EINVAL;
260 m_layout.fl_stripe_count = stripe_count;
261 return 0;
262 }
263
264 int libradosstriper::RadosStriperImpl::setObjectLayoutObjectSize
265 (unsigned int object_size)
266 {
267 /* object size must be non-zero, 64k increment */
268 if (!object_size || (object_size & (CEPH_MIN_STRIPE_UNIT-1)))
269 return -EINVAL;
270 /* object size must be a multiple of stripe unit */
271 if (object_size < m_layout.fl_stripe_unit ||
272 object_size % m_layout.fl_stripe_unit)
273 return -EINVAL;
274 m_layout.fl_object_size = object_size;
275 return 0;
276 }
277
278 ///////////////////////// xattrs /////////////////////////////
279
280 int libradosstriper::RadosStriperImpl::getxattr(const object_t& soid,
281 const char *name,
282 bufferlist& bl)
283 {
284 std::string firstObjOid = getObjectId(soid, 0);
285 return m_ioCtx.getxattr(firstObjOid, name, bl);
286 }
287
288 int libradosstriper::RadosStriperImpl::setxattr(const object_t& soid,
289 const char *name,
290 bufferlist& bl)
291 {
292 std::string firstObjOid = getObjectId(soid, 0);
293 return m_ioCtx.setxattr(firstObjOid, name, bl);
294 }
295
296 int libradosstriper::RadosStriperImpl::getxattrs(const object_t& soid,
297 map<string, bufferlist>& attrset)
298 {
299 std::string firstObjOid = getObjectId(soid, 0);
300 int rc = m_ioCtx.getxattrs(firstObjOid, attrset);
301 if (rc) return rc;
302 // cleanup internal attributes dedicated to striping and locking
303 attrset.erase(XATTR_LAYOUT_STRIPE_UNIT);
304 attrset.erase(XATTR_LAYOUT_STRIPE_COUNT);
305 attrset.erase(XATTR_LAYOUT_OBJECT_SIZE);
306 attrset.erase(XATTR_SIZE);
307 attrset.erase(std::string(LOCK_PREFIX) + RADOS_LOCK_NAME);
308 return rc;
309 }
310
311 int libradosstriper::RadosStriperImpl::rmxattr(const object_t& soid,
312 const char *name)
313 {
314 std::string firstObjOid = getObjectId(soid, 0);
315 return m_ioCtx.rmxattr(firstObjOid, name);
316 }
317
318 ///////////////////////// io /////////////////////////////
319
320 int libradosstriper::RadosStriperImpl::write(const std::string& soid,
321 const bufferlist& bl,
322 size_t len,
323 uint64_t off)
324 {
325 // open the object. This will create it if needed, retrieve its layout
326 // and size and take a shared lock on it
327 ceph_file_layout layout;
328 std::string lockCookie;
329 int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true);
330 if (rc) return rc;
331 return write_in_open_object(soid, layout, lockCookie, bl, len, off);
332 }
333
334 int libradosstriper::RadosStriperImpl::append(const std::string& soid,
335 const bufferlist& bl,
336 size_t len)
337 {
338 // open the object. This will create it if needed, retrieve its layout
339 // and size and take a shared lock on it
340 ceph_file_layout layout;
341 uint64_t size = len;
342 std::string lockCookie;
343 int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false);
344 if (rc) return rc;
345 return write_in_open_object(soid, layout, lockCookie, bl, len, size);
346 }
347
348 int libradosstriper::RadosStriperImpl::write_full(const std::string& soid,
349 const bufferlist& bl)
350 {
351 int rc = trunc(soid, 0);
352 if (rc && rc != -ENOENT) return rc; // ENOENT is obviously ok
353 return write(soid, bl, bl.length(), 0);
354 }
355
356 int libradosstriper::RadosStriperImpl::read(const std::string& soid,
357 bufferlist* bl,
358 size_t len,
359 uint64_t off)
360 {
361 // create a completion object
362 librados::AioCompletionImpl c;
363 // call asynchronous method
364 int rc = aio_read(soid, &c, bl, len, off);
365 // and wait for completion
366 if (!rc) {
367 // wait for completion
368 c.wait_for_complete_and_cb();
369 // return result
370 rc = c.get_return_value();
371 }
372 return rc;
373 }
374
375 ///////////////////////// asynchronous io /////////////////////////////
376
377 int libradosstriper::RadosStriperImpl::aio_write(const std::string& soid,
378 librados::AioCompletionImpl *c,
379 const bufferlist& bl,
380 size_t len,
381 uint64_t off)
382 {
383 ceph_file_layout layout;
384 std::string lockCookie;
385 int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true);
386 if (rc) return rc;
387 return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, off);
388 }
389
390 int libradosstriper::RadosStriperImpl::aio_append(const std::string& soid,
391 librados::AioCompletionImpl *c,
392 const bufferlist& bl,
393 size_t len)
394 {
395 ceph_file_layout layout;
396 uint64_t size = len;
397 std::string lockCookie;
398 int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false);
399 if (rc) return rc;
400 // create a completion object
401 return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, size);
402 }
403
404 int libradosstriper::RadosStriperImpl::aio_write_full(const std::string& soid,
405 librados::AioCompletionImpl *c,
406 const bufferlist& bl)
407 {
408 int rc = trunc(soid, 0);
409 if (rc) return rc;
410 return aio_write(soid, c, bl, bl.length(), 0);
411 }
412
413 static void rados_read_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg)
414 {
415 libradosstriper::RadosStriperImpl::ReadCompletionData *cdata =
416 reinterpret_cast<libradosstriper::RadosStriperImpl::ReadCompletionData*>(arg);
417 libradosstriper::MultiAioCompletionImpl *comp =
418 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
419 cdata->complete_unlock(comp->rval);
420 cdata->put();
421 }
422
423 static void striper_read_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
424 {
425 libradosstriper::RadosStriperImpl::ReadCompletionData *cdata =
426 reinterpret_cast<libradosstriper::RadosStriperImpl::ReadCompletionData*>(arg);
427 // launch the async unlocking of the object
428 cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion);
429 // complete the read part in parallel
430 libradosstriper::MultiAioCompletionImpl *comp =
431 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
432 cdata->complete_read(comp->rval);
433 }
434
435 static void rados_req_read_safe(rados_completion_t c, void *arg)
436 {
437 libradosstriper::RadosStriperImpl::RadosReadCompletionData *data =
438 reinterpret_cast<libradosstriper::RadosStriperImpl::RadosReadCompletionData*>(arg);
439 int rc = rados_aio_get_return_value(c);
440 // ENOENT means that we are dealing with a sparse file. This is fine,
441 // data (0s) will be created on the fly by the rados_req_read_complete method
442 if (rc == -ENOENT) rc = 0;
443 libradosstriper::MultiAioCompletionImpl *multiAioComp = data->m_multiAioCompl;
444 multiAioComp->safe_request(rc);
445 data->put();
446 }
447
448 static void rados_req_read_complete(rados_completion_t c, void *arg)
449 {
450 libradosstriper::RadosStriperImpl::RadosReadCompletionData *data =
451 reinterpret_cast<libradosstriper::RadosStriperImpl::RadosReadCompletionData*>(arg);
452 int rc = rados_aio_get_return_value(c);
453 // We need to handle the case of sparse files here
454 if (rc == -ENOENT) {
455 // the object did not exist at all. This can happen for sparse files.
456 // we consider we've read 0 bytes and it will fall into next case
457 rc = 0;
458 }
459 if (rc >= 0 && (((uint64_t)rc) < data->m_expectedBytes)) {
460 // only partial data were present in the object (or the object did not
461 // even exist if we've gone through previous case).
462 // This is typical of sparse file and we need to complete with 0s.
463 unsigned int lenOfZeros = data->m_expectedBytes-rc;
464 unsigned int existingDataToZero = min(data->m_bl->length()-rc, lenOfZeros);
465 if (existingDataToZero > 0) {
466 data->m_bl->zero(rc, existingDataToZero);
467 }
468 if (lenOfZeros > existingDataToZero) {
469 ceph::bufferptr zeros(ceph::buffer::create(lenOfZeros-existingDataToZero));
470 zeros.zero();
471 data->m_bl->push_back(zeros);
472 }
473 rc = data->m_expectedBytes;
474 }
475 libradosstriper::MultiAioCompletionImpl * multiAioComp = data->m_multiAioCompl;
476 multiAioComp->complete_request(rc);
477 data->put();
478 }
479
480 int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid,
481 librados::AioCompletionImpl *c,
482 bufferlist* bl,
483 size_t len,
484 uint64_t off)
485 {
486 // open the object. This will retrieve its layout and size
487 // and take a shared lock on it
488 ceph_file_layout layout;
489 uint64_t size;
490 std::string lockCookie;
491 int rc = openStripedObjectForRead(soid, &layout, &size, &lockCookie);
492 if (rc) return rc;
493 // find out the actual number of bytes we can read
494 uint64_t read_len;
495 if (off >= size) {
496 // nothing to read ! We are done.
497 read_len = 0;
498 } else {
499 read_len = min(len, (size_t)(size-off));
500 }
501 // get list of extents to be read from
502 vector<ObjectExtent> *extents = new vector<ObjectExtent>();
503 if (read_len > 0) {
504 std::string format = soid + RADOS_OBJECT_EXTENSION_FORMAT;
505 file_layout_t l;
506 l.from_legacy(layout);
507 Striper::file_to_extents(cct(), format.c_str(), &l, off, read_len,
508 0, *extents);
509 }
510
511 // create a completion object and transfer ownership of extents and resultbl
512 vector<bufferlist> *resultbl = new vector<bufferlist>(extents->size());
513 ReadCompletionData *cdata = new ReadCompletionData(this, soid, lockCookie, c,
514 bl, extents, resultbl, 1);
515 c->is_read = true;
516 c->io = m_ioCtxImpl;
517 // create a completion for the unlocking of the striped object at the end of the read
518 librados::AioCompletion *unlock_completion =
519 librados::Rados::aio_create_completion(cdata, rados_read_aio_unlock_complete, 0);
520 cdata->m_unlockCompletion = unlock_completion;
521 // create the multiCompletion object handling the reads
522 libradosstriper::MultiAioCompletionImpl *nc = new libradosstriper::MultiAioCompletionImpl;
523 nc->set_complete_callback(cdata, striper_read_aio_req_complete);
524 // go through the extents
525 int r = 0, i = 0;
526 for (vector<ObjectExtent>::iterator p = extents->begin(); p != extents->end(); ++p) {
527 // create a buffer list describing where to place data read from current extend
528 bufferlist *oid_bl = &((*resultbl)[i++]);
529 for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
530 q != p->buffer_extents.end();
531 ++q) {
532 bufferlist buffer_bl;
533 buffer_bl.substr_of(*bl, q->first, q->second);
534 oid_bl->append(buffer_bl);
535 }
536 // read all extends of a given object in one go
537 nc->add_request();
538 // we need 2 references on data as both rados_req_read_safe and rados_req_read_complete
539 // will release one
540 RadosReadCompletionData *data = new RadosReadCompletionData(nc, p->length, oid_bl, cct(), 2);
541 librados::AioCompletion *rados_completion =
542 librados::Rados::aio_create_completion(data, rados_req_read_complete, rados_req_read_safe);
543 r = m_ioCtx.aio_read(p->oid.name, rados_completion, oid_bl, p->length, p->offset);
544 rados_completion->release();
545 if (r < 0)
546 break;
547 }
548 nc->finish_adding_requests();
549 nc->put();
550 return r;
551 }
552
553 int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid,
554 librados::AioCompletionImpl *c,
555 char* buf,
556 size_t len,
557 uint64_t off)
558 {
559 // create a buffer list and store it inside the completion object
560 c->bl.clear();
561 c->bl.push_back(buffer::create_static(len, buf));
562 // call the bufferlist version of this method
563 return aio_read(soid, c, &c->bl, len, off);
564 }
565
566 int libradosstriper::RadosStriperImpl::aio_flush()
567 {
568 int ret;
569 // pass to the rados level
570 ret = m_ioCtx.aio_flush();
571 if (ret < 0)
572 return ret;
573 //wait all CompletionData are released
574 lock.Lock();
575 while (m_refCnt > 1)
576 cond.Wait(lock);
577 lock.Unlock();
578 return ret;
579 }
580
581 ///////////////////////// stat and deletion /////////////////////////////
582
583 int libradosstriper::RadosStriperImpl::stat(const std::string& soid, uint64_t *psize, time_t *pmtime)
584 {
585 // create a completion object
586 librados::AioCompletionImpl c;
587 // call asynchronous version of stat
588 int rc = aio_stat(soid, &c, psize, pmtime);
589 if (rc == 0) {
590 // wait for completion of the remove
591 c.wait_for_complete();
592 // get result
593 rc = c.get_return_value();
594 }
595 return rc;
596 }
597
598 static void striper_stat_aio_stat_complete(rados_completion_t c, void *arg) {
599 libradosstriper::RadosStriperImpl::BasicStatCompletionData *data =
600 reinterpret_cast<libradosstriper::RadosStriperImpl::BasicStatCompletionData*>(arg);
601 int rc = rados_aio_get_return_value(c);
602 if (rc == -ENOENT) {
603 // remember this has failed
604 data->m_statRC = rc;
605 }
606 data->m_multiCompletion->complete_request(rc);
607 data->put();
608 }
609
610 static void striper_stat_aio_getxattr_complete(rados_completion_t c, void *arg) {
611 libradosstriper::RadosStriperImpl::BasicStatCompletionData *data =
612 reinterpret_cast<libradosstriper::RadosStriperImpl::BasicStatCompletionData*>(arg);
613 int rc = rados_aio_get_return_value(c);
614 // We need to handle the case of sparse files here
615 if (rc < 0) {
616 // remember this has failed
617 data->m_getxattrRC = rc;
618 } else {
619 // this intermediate string allows to add a null terminator before calling strtol
620 std::string err;
621 std::string strsize(data->m_bl.c_str(), data->m_bl.length());
622 *data->m_psize = strict_strtoll(strsize.c_str(), 10, &err);
623 if (!err.empty()) {
624 lderr(data->m_striper->cct()) << XATTR_SIZE << " : " << err << dendl;
625 data->m_getxattrRC = -EINVAL;
626 }
627 rc = 0;
628 }
629 data->m_multiCompletion->complete_request(rc);
630 data->put();
631 }
632
633 static void striper_stat_aio_req_complete(rados_striper_multi_completion_t c,
634 void *arg) {
635 libradosstriper::RadosStriperImpl::BasicStatCompletionData *data =
636 reinterpret_cast<libradosstriper::RadosStriperImpl::BasicStatCompletionData*>(arg);
637 if (data->m_statRC) {
638 data->complete(data->m_statRC);
639 } else {
640 if (data->m_getxattrRC < 0) {
641 data->complete(data->m_getxattrRC);
642 } else {
643 data->complete(0);
644 }
645 }
646 data->put();
647 }
648
649 template<class TimeType>
650 int libradosstriper::RadosStriperImpl::aio_generic_stat
651 (const std::string& soid,
652 librados::AioCompletionImpl *c,
653 uint64_t *psize,
654 TimeType *pmtime,
655 typename libradosstriper::RadosStriperImpl::StatFunction<TimeType>::Type statFunction)
656 {
657 // use a MultiAioCompletion object for dealing with the fact
658 // that we'll do 2 asynchronous calls in parallel
659 libradosstriper::MultiAioCompletionImpl *multi_completion =
660 new libradosstriper::MultiAioCompletionImpl;
661 // Data object used for passing context to asynchronous calls
662 std::string firstObjOid = getObjectId(soid, 0);
663 StatCompletionData<TimeType> *cdata =
664 new StatCompletionData<TimeType>(this, firstObjOid, c,
665 multi_completion, psize, pmtime, 4);
666 multi_completion->set_complete_callback(cdata, striper_stat_aio_req_complete);
667 // use a regular AioCompletion for the stat async call
668 librados::AioCompletion *stat_completion =
669 librados::Rados::aio_create_completion(cdata, striper_stat_aio_stat_complete, 0);
670 multi_completion->add_safe_request();
671 object_t obj(firstObjOid);
672 int rc = (m_ioCtxImpl->*statFunction)(obj, stat_completion->pc,
673 &cdata->m_objectSize, cdata->m_pmtime);
674 stat_completion->release();
675 if (rc < 0) {
676 // nothing is really started so cancel everything
677 delete multi_completion;
678 delete cdata;
679 return rc;
680 }
681 // use a regular AioCompletion for the getxattr async call
682 librados::AioCompletion *getxattr_completion =
683 librados::Rados::aio_create_completion(cdata, striper_stat_aio_getxattr_complete, 0);
684 multi_completion->add_safe_request();
685 // in parallel, get the pmsize from the first object asynchronously
686 rc = m_ioCtxImpl->aio_getxattr(obj, getxattr_completion->pc,
687 XATTR_SIZE, cdata->m_bl);
688 getxattr_completion->release();
689 multi_completion->finish_adding_requests();
690 if (rc < 0) {
691 // the async stat is ongoing, so we need to go on
692 // we mark the getxattr as failed in the data object
693 cdata->m_getxattrRC = rc;
694 multi_completion->complete_request(rc);
695 multi_completion->put();
696 return rc;
697 }
698 cdata->put();
699 multi_completion->put();
700 return 0;
701 }
702
703 int libradosstriper::RadosStriperImpl::aio_stat(const std::string& soid,
704 librados::AioCompletionImpl *c,
705 uint64_t *psize,
706 time_t *pmtime)
707 {
708 return aio_generic_stat<time_t>(soid, c, psize, pmtime, &librados::IoCtxImpl::aio_stat);
709 }
710
711 int libradosstriper::RadosStriperImpl::stat2(const std::string& soid, uint64_t *psize, struct timespec *pts)
712 {
713 // create a completion object
714 librados::AioCompletionImpl c;
715 // call asynchronous version of stat
716 int rc = aio_stat2(soid, &c, psize, pts);
717 if (rc == 0) {
718 // wait for completion of the remove
719 c.wait_for_complete_and_cb();
720 // get result
721 rc = c.get_return_value();
722 }
723 return rc;
724 }
725
726 int libradosstriper::RadosStriperImpl::aio_stat2(const std::string& soid,
727 librados::AioCompletionImpl *c,
728 uint64_t *psize,
729 struct timespec *pts)
730 {
731 return aio_generic_stat<struct timespec>(soid, c, psize, pts, &librados::IoCtxImpl::aio_stat2);
732 }
733
734 static void rados_req_remove_complete(rados_completion_t c, void *arg)
735 {
736 libradosstriper::RadosStriperImpl::RadosRemoveCompletionData *cdata =
737 reinterpret_cast<libradosstriper::RadosStriperImpl::RadosRemoveCompletionData*>(arg);
738 int rc = rados_aio_get_return_value(c);
739 // in case the object did not exist, it means we had a sparse file, all is fine
740 if (rc == -ENOENT) {
741 rc = 0;
742 }
743 cdata->m_multiAioCompl->complete_request(rc);
744 cdata->put();
745 }
746
747 static void rados_req_remove_safe(rados_completion_t c, void *arg)
748 {
749 libradosstriper::RadosStriperImpl::RadosRemoveCompletionData *cdata =
750 reinterpret_cast<libradosstriper::RadosStriperImpl::RadosRemoveCompletionData*>(arg);
751 int rc = rados_aio_get_return_value(c);
752 // in case the object did not exist, it means we had a sparse file, all is fine
753 if (rc == -ENOENT) {
754 rc = 0;
755 }
756 cdata->m_multiAioCompl->safe_request(rc);
757 cdata->put();
758 }
759
760 static void striper_remove_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
761 {
762 libradosstriper::RadosStriperImpl::RemoveCompletionData *cdata =
763 reinterpret_cast<libradosstriper::RadosStriperImpl::RemoveCompletionData*>(arg);
764 libradosstriper::MultiAioCompletionImpl *comp =
765 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
766 ldout(cdata->m_striper->cct(), 10)
767 << "RadosStriperImpl : striper_remove_aio_req_complete called for "
768 << cdata->m_soid << dendl;
769 int rc = comp->rval;
770 if (rc == 0) {
771 // All went fine, synchronously remove first object
772 rc = cdata->m_striper->m_ioCtx.remove(cdata->m_striper->getObjectId(cdata->m_soid, 0),
773 cdata->flags);
774 } else {
775 lderr(cdata->m_striper->cct())
776 << "RadosStriperImpl : deletion/truncation incomplete for " << cdata->m_soid
777 << ", as errors were encountered. The file is left present but it's content "
778 << " has been partially removed"
779 << dendl;
780 }
781 cdata->complete(rc);
782 cdata->put();
783 }
784
785 int libradosstriper::RadosStriperImpl::remove(const std::string& soid, int flags)
786 {
787 // create a completion object
788 librados::AioCompletionImpl c;
789 // call asynchronous version of remove
790 int rc = aio_remove(soid, &c, flags);
791 if (rc == 0) {
792 // wait for completion of the remove
793 c.wait_for_complete_and_cb();
794 // get result
795 rc = c.get_return_value();
796 }
797 return rc;
798 }
799
800 int libradosstriper::RadosStriperImpl::aio_remove(const std::string& soid,
801 librados::AioCompletionImpl *c,
802 int flags)
803 {
804 // the RemoveCompletionData object will lock the given soid for the duration
805 // of the removal
806 std::string lockCookie = getUUID();
807 int rc = m_ioCtx.lock_exclusive(getObjectId(soid, 0), RADOS_LOCK_NAME, lockCookie, "", 0, 0);
808 if (rc) return rc;
809 // create CompletionData for the async remove call
810 RemoveCompletionData *cdata = new RemoveCompletionData(this, soid, lockCookie, c, flags);
811 libradosstriper::MultiAioCompletionImpl *multi_completion =
812 new libradosstriper::MultiAioCompletionImpl;
813 multi_completion->set_complete_callback(cdata, striper_remove_aio_req_complete);
814 // call asynchronous internal version of remove
815 ldout(cct(), 10)
816 << "RadosStriperImpl : Aio_remove starting for "
817 << soid << dendl;
818 rc = internal_aio_remove(soid, multi_completion);
819 multi_completion->put();
820 return rc;
821 }
822
823 int libradosstriper::RadosStriperImpl::internal_aio_remove
824 (const std::string& soid,
825 libradosstriper::MultiAioCompletionImpl *multi_completion,
826 int flags)
827 {
828 std::string firstObjOid = getObjectId(soid, 0);
829 try {
830 // check size and get number of rados objects to delete
831 uint64_t nb_objects = 0;
832 bufferlist bl2;
833 int rc = getxattr(soid, XATTR_SIZE, bl2);
834 if (rc < 0) {
835 // no object size (or not able to get it)
836 // try to find the number of object "by hand"
837 uint64_t psize;
838 time_t pmtime;
839 while (!m_ioCtx.stat(getObjectId(soid, nb_objects), &psize, &pmtime)) {
840 nb_objects++;
841 }
842 } else {
843 // count total number of rados objects in the striped object
844 std::string err;
845 // this intermediate string allows to add a null terminator before calling strtol
846 std::string strsize(bl2.c_str(), bl2.length());
847 uint64_t size = strict_strtoll(strsize.c_str(), 10, &err);
848 if (!err.empty()) {
849 lderr(cct()) << XATTR_SIZE << " : " << err << dendl;
850
851 return -EINVAL;
852 }
853 uint64_t object_size = m_layout.fl_object_size;
854 uint64_t su = m_layout.fl_stripe_unit;
855 uint64_t stripe_count = m_layout.fl_stripe_count;
856 uint64_t nb_complete_sets = size / (object_size*stripe_count);
857 uint64_t remaining_data = size % (object_size*stripe_count);
858 uint64_t remaining_stripe_units = (remaining_data + su -1) / su;
859 uint64_t remaining_objects = std::min(remaining_stripe_units, stripe_count);
860 nb_objects = nb_complete_sets * stripe_count + remaining_objects;
861 }
862 // delete rados objects in reverse order
863 // Note that we do not drop the first object. This one will only be dropped
864 // if all other removals have been successful, and this is done in the
865 // callback of the multi_completion object
866 int rcr = 0;
867 for (int i = nb_objects-1; i >= 1; i--) {
868 multi_completion->add_request();
869 RadosRemoveCompletionData *data =
870 new RadosRemoveCompletionData(multi_completion, cct());
871 librados::AioCompletion *rados_completion =
872 librados::Rados::aio_create_completion(data,
873 rados_req_remove_complete,
874 rados_req_remove_safe);
875 if (flags == 0) {
876 rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion);
877 } else {
878 rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion, flags);
879 }
880 rados_completion->release();
881 if (rcr < 0 and -ENOENT != rcr) {
882 lderr(cct()) << "RadosStriperImpl::remove : deletion incomplete for " << soid
883 << ", as " << getObjectId(soid, i) << " could not be deleted (rc=" << rc << ")"
884 << dendl;
885 break;
886 }
887 }
888 // we are over adding requests to the multi_completion object
889 multi_completion->finish_adding_requests();
890 // return
891 return rcr;
892 } catch (ErrorCode &e) {
893 // errror caught when trying to take the exclusive lock
894 return e.m_code;
895 }
896
897 }
898
899 int libradosstriper::RadosStriperImpl::trunc(const std::string& soid, uint64_t size)
900 {
901 // lock the object in exclusive mode
902 std::string firstObjOid = getObjectId(soid, 0);
903 librados::ObjectWriteOperation op;
904 op.assert_exists();
905 std::string lockCookie = RadosStriperImpl::getUUID();
906 utime_t dur = utime_t();
907 rados::cls::lock::lock(&op, RADOS_LOCK_NAME, LOCK_EXCLUSIVE, lockCookie, "", "", dur, 0);
908 int rc = m_ioCtx.operate(firstObjOid, &op);
909 if (rc) return rc;
910 // load layout and size
911 ceph_file_layout layout;
912 uint64_t original_size;
913 rc = internal_get_layout_and_size(firstObjOid, &layout, &original_size);
914 if (!rc) {
915 if (size < original_size) {
916 rc = truncate(soid, original_size, size, layout);
917 } else if (size > original_size) {
918 rc = grow(soid, original_size, size, layout);
919 }
920 }
921 // unlock object, ignore return code as we cannot do much
922 m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie);
923 // final return
924 return rc;
925 }
926
927
928 ///////////////////////// private helpers /////////////////////////////
929
930 std::string libradosstriper::RadosStriperImpl::getObjectId(const object_t& soid,
931 long long unsigned objectno)
932 {
933 std::ostringstream s;
934 s << soid << '.' << std::setfill ('0') << std::setw(16) << std::hex << objectno;
935 return s.str();
936 }
937
938 void libradosstriper::RadosStriperImpl::unlockObject(const std::string& soid,
939 const std::string& lockCookie)
940 {
941 // unlock the shared lock on the first rados object
942 std::string firstObjOid = getObjectId(soid, 0);
943 m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie);
944 }
945
946 void libradosstriper::RadosStriperImpl::aio_unlockObject(const std::string& soid,
947 const std::string& lockCookie,
948 librados::AioCompletion *c)
949 {
950 // unlock the shared lock on the first rados object
951 std::string firstObjOid = getObjectId(soid, 0);
952 m_ioCtx.aio_unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie, c);
953 }
954
955 static void rados_write_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg)
956 {
957 libradosstriper::RadosStriperImpl::WriteCompletionData *cdata =
958 reinterpret_cast<libradosstriper::RadosStriperImpl::WriteCompletionData*>(arg);
959 libradosstriper::MultiAioCompletionImpl *comp =
960 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
961 cdata->complete_unlock(comp->rval);
962 cdata->put();
963 }
964
965 static void striper_write_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
966 {
967 libradosstriper::RadosStriperImpl::WriteCompletionData *cdata =
968 reinterpret_cast<libradosstriper::RadosStriperImpl::WriteCompletionData*>(arg);
969 // launch the async unlocking of the object
970 cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion);
971 // complete the write part in parallel
972 libradosstriper::MultiAioCompletionImpl *comp =
973 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
974 cdata->complete_write(comp->rval);
975 cdata->put();
976 }
977
978 static void striper_write_aio_req_safe(rados_striper_multi_completion_t c, void *arg)
979 {
980 libradosstriper::RadosStriperImpl::WriteCompletionData *cdata =
981 reinterpret_cast<libradosstriper::RadosStriperImpl::WriteCompletionData*>(arg);
982 libradosstriper::MultiAioCompletionImpl *comp =
983 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
984 cdata->safe(comp->rval);
985 cdata->put();
986 }
987
988 int libradosstriper::RadosStriperImpl::write_in_open_object(const std::string& soid,
989 const ceph_file_layout& layout,
990 const std::string& lockCookie,
991 const bufferlist& bl,
992 size_t len,
993 uint64_t off) {
994 // create a completion object to be passed to the callbacks of the multicompletion
995 // we need 3 references as striper_write_aio_req_complete will release two and
996 // striper_write_aio_req_safe will release one
997 WriteCompletionData *cdata = new WriteCompletionData(this, soid, lockCookie, 0, 3);
998 cdata->get(); // local ref
999 // create a completion object for the unlocking of the striped object at the end of the write
1000 librados::AioCompletion *unlock_completion =
1001 librados::Rados::aio_create_completion(cdata, rados_write_aio_unlock_complete, 0);
1002 cdata->m_unlockCompletion = unlock_completion;
1003 // create the multicompletion that will handle the write completion
1004 libradosstriper::MultiAioCompletionImpl *c = new libradosstriper::MultiAioCompletionImpl;
1005 c->set_complete_callback(cdata, striper_write_aio_req_complete);
1006 c->set_safe_callback(cdata, striper_write_aio_req_safe);
1007 // call the asynchronous API
1008 int rc = internal_aio_write(soid, c, bl, len, off, layout);
1009 if (!rc) {
1010 // wait for completion and safety of data
1011 c->wait_for_complete_and_cb();
1012 c->wait_for_safe_and_cb();
1013 // wait for the unlocking
1014 unlock_completion->wait_for_complete();
1015 // return result
1016 rc = c->get_return_value();
1017 }
1018 c->put();
1019 cdata->put();
1020 return rc;
1021 }
1022
1023 int libradosstriper::RadosStriperImpl::aio_write_in_open_object(const std::string& soid,
1024 librados::AioCompletionImpl *c,
1025 const ceph_file_layout& layout,
1026 const std::string& lockCookie,
1027 const bufferlist& bl,
1028 size_t len,
1029 uint64_t off) {
1030 // create a completion object to be passed to the callbacks of the multicompletion
1031 // we need 3 references as striper_write_aio_req_complete will release two and
1032 // striper_write_aio_req_safe will release one
1033 WriteCompletionData *cdata = new WriteCompletionData(this, soid, lockCookie, c, 3);
1034 cdata->get(); // local ref
1035 m_ioCtxImpl->get();
1036 c->io = m_ioCtxImpl;
1037 // create a completion object for the unlocking of the striped object at the end of the write
1038 librados::AioCompletion *unlock_completion =
1039 librados::Rados::aio_create_completion(cdata, rados_write_aio_unlock_complete, 0);
1040 cdata->m_unlockCompletion = unlock_completion;
1041 // create the multicompletion that will handle the write completion
1042 libradosstriper::MultiAioCompletionImpl *nc = new libradosstriper::MultiAioCompletionImpl;
1043 nc->set_complete_callback(cdata, striper_write_aio_req_complete);
1044 nc->set_safe_callback(cdata, striper_write_aio_req_safe);
1045 // internal asynchronous API
1046 int rc = internal_aio_write(soid, nc, bl, len, off, layout);
1047 nc->put();
1048 cdata->put();
1049 return rc;
1050 }
1051
1052 static void rados_req_write_safe(rados_completion_t c, void *arg)
1053 {
1054 libradosstriper::MultiAioCompletionImpl *comp =
1055 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(arg);
1056 comp->safe_request(rados_aio_get_return_value(c));
1057 }
1058
1059 static void rados_req_write_complete(rados_completion_t c, void *arg)
1060 {
1061 libradosstriper::MultiAioCompletionImpl *comp =
1062 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(arg);
1063 comp->complete_request(rados_aio_get_return_value(c));
1064 }
1065
1066 int
1067 libradosstriper::RadosStriperImpl::internal_aio_write(const std::string& soid,
1068 libradosstriper::MultiAioCompletionImpl *c,
1069 const bufferlist& bl,
1070 size_t len,
1071 uint64_t off,
1072 const ceph_file_layout& layout)
1073 {
1074 int r = 0;
1075 // Do not try anything if we are called with empty buffer,
1076 // file_to_extents would raise an exception
1077 if (len > 0) {
1078 // get list of extents to be written to
1079 vector<ObjectExtent> extents;
1080 std::string format = soid + RADOS_OBJECT_EXTENSION_FORMAT;
1081 file_layout_t l;
1082 l.from_legacy(layout);
1083 Striper::file_to_extents(cct(), format.c_str(), &l, off, len, 0, extents);
1084 // go through the extents
1085 for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
1086 // assemble pieces of a given object into a single buffer list
1087 bufferlist oid_bl;
1088 for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
1089 q != p->buffer_extents.end();
1090 ++q) {
1091 bufferlist buffer_bl;
1092 buffer_bl.substr_of(bl, q->first, q->second);
1093 oid_bl.append(buffer_bl);
1094 }
1095 // and write the object
1096 c->add_request();
1097 librados::AioCompletion *rados_completion =
1098 librados::Rados::aio_create_completion(c, rados_req_write_complete, rados_req_write_safe);
1099 r = m_ioCtx.aio_write(p->oid.name, rados_completion, oid_bl, p->length, p->offset);
1100 rados_completion->release();
1101 if (r < 0)
1102 break;
1103 }
1104 }
1105 c->finish_adding_requests();
1106 return r;
1107 }
1108
1109 int libradosstriper::RadosStriperImpl::extract_uint32_attr
1110 (std::map<std::string, bufferlist> &attrs,
1111 const std::string& key,
1112 ceph_le32 *value)
1113 {
1114 std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key);
1115 if (attrsIt != attrs.end()) {
1116 // this intermediate string allows to add a null terminator before calling strtol
1117 std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length());
1118 std::string err;
1119 *value = strict_strtol(strvalue.c_str(), 10, &err);
1120 if (!err.empty()) {
1121 lderr(cct()) << key << " : " << err << dendl;
1122 return -EINVAL;
1123 }
1124 } else {
1125 return -ENOENT;
1126 }
1127 return 0;
1128 }
1129
1130 int libradosstriper::RadosStriperImpl::extract_sizet_attr
1131 (std::map<std::string, bufferlist> &attrs,
1132 const std::string& key,
1133 size_t *value)
1134 {
1135 std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key);
1136 if (attrsIt != attrs.end()) {
1137 // this intermediate string allows to add a null terminator before calling strtol
1138 std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length());
1139 std::string err;
1140 *value = strict_strtoll(strvalue.c_str(), 10, &err);
1141 if (!err.empty()) {
1142 lderr(cct()) << key << " : " << err << dendl;
1143 return -EINVAL;
1144 }
1145 } else {
1146 return -ENOENT;
1147 }
1148 return 0;
1149 }
1150
1151 int libradosstriper::RadosStriperImpl::internal_get_layout_and_size(
1152 const std::string& oid,
1153 ceph_file_layout *layout,
1154 uint64_t *size)
1155 {
1156 // get external attributes of the first rados object
1157 std::map<std::string, bufferlist> attrs;
1158 int rc = m_ioCtx.getxattrs(oid, attrs);
1159 if (rc) return rc;
1160 // deal with stripe_unit
1161 rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_UNIT, &layout->fl_stripe_unit);
1162 if (rc) return rc;
1163 // deal with stripe_count
1164 rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_COUNT, &layout->fl_stripe_count);
1165 if (rc) return rc;
1166 // deal with object_size
1167 rc = extract_uint32_attr(attrs, XATTR_LAYOUT_OBJECT_SIZE, &layout->fl_object_size);
1168 if (rc) return rc;
1169 // deal with size
1170 size_t ssize;
1171 rc = extract_sizet_attr(attrs, XATTR_SIZE, &ssize);
1172 if (rc) {
1173 return rc;
1174 }
1175 *size = ssize;
1176 // make valgrind happy by setting unused fl_pg_pool
1177 layout->fl_pg_pool = 0;
1178 return 0;
1179 }
1180
1181 int libradosstriper::RadosStriperImpl::openStripedObjectForRead(
1182 const std::string& soid,
1183 ceph_file_layout *layout,
1184 uint64_t *size,
1185 std::string *lockCookie)
1186 {
1187 // take a lock the first rados object, if it exists and gets its size
1188 // check, lock and size reading must be atomic and are thus done within a single operation
1189 librados::ObjectWriteOperation op;
1190 op.assert_exists();
1191 *lockCookie = getUUID();
1192 utime_t dur = utime_t();
1193 rados::cls::lock::lock(&op, RADOS_LOCK_NAME, LOCK_SHARED, *lockCookie, "Tag", "", dur, 0);
1194 std::string firstObjOid = getObjectId(soid, 0);
1195 int rc = m_ioCtx.operate(firstObjOid, &op);
1196 if (rc) {
1197 // error case (including -ENOENT)
1198 return rc;
1199 }
1200 rc = internal_get_layout_and_size(firstObjOid, layout, size);
1201 if (rc) {
1202 unlockObject(soid, *lockCookie);
1203 lderr(cct()) << "RadosStriperImpl::openStripedObjectForRead : "
1204 << "could not load layout and size for "
1205 << soid << " : rc = " << rc << dendl;
1206 }
1207 return rc;
1208 }
1209
1210 int libradosstriper::RadosStriperImpl::openStripedObjectForWrite(const std::string& soid,
1211 ceph_file_layout *layout,
1212 uint64_t *size,
1213 std::string *lockCookie,
1214 bool isFileSizeAbsolute)
1215 {
1216 // take a lock the first rados object, if it exists
1217 // check and lock must be atomic and are thus done within a single operation
1218 librados::ObjectWriteOperation op;
1219 op.assert_exists();
1220 *lockCookie = getUUID();
1221 utime_t dur = utime_t();
1222 rados::cls::lock::lock(&op, RADOS_LOCK_NAME, LOCK_SHARED, *lockCookie, "Tag", "", dur, 0);
1223 std::string firstObjOid = getObjectId(soid, 0);
1224 int rc = m_ioCtx.operate(firstObjOid, &op);
1225 if (rc) {
1226 if (rc == -ENOENT) {
1227 // object does not exist, delegate to createEmptyStripedObject
1228 int rc = createAndOpenStripedObject(soid, layout, *size, lockCookie, isFileSizeAbsolute);
1229 // return original size
1230 *size = 0;
1231 return rc;
1232 } else {
1233 return rc;
1234 }
1235 }
1236 // all fine
1237 uint64_t curSize;
1238 rc = internal_get_layout_and_size(firstObjOid, layout, &curSize);
1239 if (rc) {
1240 unlockObject(soid, *lockCookie);
1241 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1242 << "could not load layout and size for "
1243 << soid << " : rc = " << rc << dendl;
1244 return rc;
1245 }
1246 // atomically update object size, only if smaller than current one
1247 if (!isFileSizeAbsolute)
1248 *size += curSize;
1249 librados::ObjectWriteOperation writeOp;
1250 writeOp.cmpxattr(XATTR_SIZE, LIBRADOS_CMPXATTR_OP_GT, *size);
1251 std::ostringstream oss;
1252 oss << *size;
1253 bufferlist bl;
1254 bl.append(oss.str());
1255 writeOp.setxattr(XATTR_SIZE, bl);
1256 rc = m_ioCtx.operate(firstObjOid, &writeOp);
1257 // return current size
1258 *size = curSize;
1259 // handle case where objectsize is already bigger than size
1260 if (-ECANCELED == rc)
1261 rc = 0;
1262 if (rc) {
1263 unlockObject(soid, *lockCookie);
1264 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1265 << "could not set new size for "
1266 << soid << " : rc = " << rc << dendl;
1267 }
1268 return rc;
1269 }
1270
1271 int libradosstriper::RadosStriperImpl::createAndOpenStripedObject(const std::string& soid,
1272 ceph_file_layout *layout,
1273 uint64_t size,
1274 std::string *lockCookie,
1275 bool isFileSizeAbsolute)
1276 {
1277 // build atomic write operation
1278 librados::ObjectWriteOperation writeOp;
1279 writeOp.create(true);
1280 // object_size
1281 std::ostringstream oss_object_size;
1282 oss_object_size << m_layout.fl_object_size;
1283 bufferlist bl_object_size;
1284 bl_object_size.append(oss_object_size.str());
1285 writeOp.setxattr(XATTR_LAYOUT_OBJECT_SIZE, bl_object_size);
1286 // stripe unit
1287 std::ostringstream oss_stripe_unit;
1288 oss_stripe_unit << m_layout.fl_stripe_unit;
1289 bufferlist bl_stripe_unit;
1290 bl_stripe_unit.append(oss_stripe_unit.str());
1291 writeOp.setxattr(XATTR_LAYOUT_STRIPE_UNIT, bl_stripe_unit);
1292 // stripe count
1293 std::ostringstream oss_stripe_count;
1294 oss_stripe_count << m_layout.fl_stripe_count;
1295 bufferlist bl_stripe_count;
1296 bl_stripe_count.append(oss_stripe_count.str());
1297 writeOp.setxattr(XATTR_LAYOUT_STRIPE_COUNT, bl_stripe_count);
1298 // size
1299 std::ostringstream oss_size;
1300 oss_size << (isFileSizeAbsolute?size:0);
1301 bufferlist bl_size;
1302 bl_size.append(oss_size.str());
1303 writeOp.setxattr(XATTR_SIZE, bl_size);
1304 // effectively change attributes
1305 std::string firstObjOid = getObjectId(soid, 0);
1306 int rc = m_ioCtx.operate(firstObjOid, &writeOp);
1307 // in case of error (but no EEXIST which would mean the object existed), return
1308 if (rc && -EEXIST != rc) return rc;
1309 // Otherwise open the object
1310 uint64_t fileSize = size;
1311 return openStripedObjectForWrite(soid, layout, &fileSize, lockCookie, isFileSizeAbsolute);
1312 }
1313
1314 static void striper_truncate_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
1315 {
1316 libradosstriper::RadosStriperImpl::TruncateCompletionData *cdata =
1317 reinterpret_cast<libradosstriper::RadosStriperImpl::TruncateCompletionData*>(arg);
1318 libradosstriper::MultiAioCompletionImpl *comp =
1319 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1320 if (0 == comp->rval) {
1321 // all went fine, change size in the external attributes
1322 std::ostringstream oss;
1323 oss << cdata->m_size;
1324 bufferlist bl;
1325 bl.append(oss.str());
1326 cdata->m_striper->setxattr(cdata->m_soid, XATTR_SIZE, bl);
1327 }
1328 cdata->put();
1329 }
1330
1331 int libradosstriper::RadosStriperImpl::truncate(const std::string& soid,
1332 uint64_t original_size,
1333 uint64_t size,
1334 ceph_file_layout &layout)
1335 {
1336 TruncateCompletionData *cdata = new TruncateCompletionData(this, soid, size);
1337 libradosstriper::MultiAioCompletionImpl *multi_completion =
1338 new libradosstriper::MultiAioCompletionImpl;
1339 multi_completion->set_complete_callback(cdata, striper_truncate_aio_req_complete);
1340 // call asynchrous version of truncate
1341 int rc = aio_truncate(soid, multi_completion, original_size, size, layout);
1342 // wait for completion of the truncation
1343 multi_completion->finish_adding_requests();
1344 multi_completion->wait_for_complete_and_cb();
1345 // return result
1346 if (rc == 0) {
1347 rc = multi_completion->get_return_value();
1348 }
1349 multi_completion->put();
1350 return rc;
1351 }
1352
1353 int libradosstriper::RadosStriperImpl::aio_truncate
1354 (const std::string& soid,
1355 libradosstriper::MultiAioCompletionImpl *multi_completion,
1356 uint64_t original_size,
1357 uint64_t size,
1358 ceph_file_layout &layout)
1359 {
1360 // handle the underlying rados objects. 3 cases here :
1361 // -- the objects belonging to object sets entirely located
1362 // before the truncation are unchanged
1363 // -- the objects belonging to the object set where the
1364 // truncation took place are truncated or removed
1365 // -- the objects belonging to object sets entirely located
1366 // after the truncation are removed
1367 // Note that we do it backward and that we change the size in
1368 // the external attributes only at the end. This make sure that
1369 // no rados object stays behind if we remove the striped object
1370 // after a truncation has failed
1371 uint64_t trunc_objectsetno = size / layout.fl_object_size / layout.fl_stripe_count;
1372 uint64_t last_objectsetno = original_size / layout.fl_object_size / layout.fl_stripe_count;
1373 bool exists = false;
1374 for (int64_t objectno = (last_objectsetno+1) * layout.fl_stripe_count-1;
1375 objectno >= (int64_t)((trunc_objectsetno + 1) * layout.fl_stripe_count);
1376 objectno--) {
1377 // if no object existed so far, check object existence
1378 if (!exists) {
1379 uint64_t nb_full_object_set = objectno / layout.fl_stripe_count;
1380 uint64_t object_index_in_set = objectno % layout.fl_stripe_count;
1381 uint64_t set_start_off = nb_full_object_set * layout.fl_object_size * layout.fl_stripe_count;
1382 uint64_t object_start_off = set_start_off + object_index_in_set * layout.fl_stripe_unit;
1383 exists = (original_size > object_start_off);
1384 }
1385 if (exists) {
1386 // remove asynchronously
1387 multi_completion->add_request();
1388 RadosRemoveCompletionData *data =
1389 new RadosRemoveCompletionData(multi_completion, cct());
1390 librados::AioCompletion *rados_completion =
1391 librados::Rados::aio_create_completion(data,
1392 rados_req_remove_complete,
1393 rados_req_remove_safe);
1394 int rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion);
1395 rados_completion->release();
1396 // in case the object did not exist, it means we had a sparse file, all is fine
1397 if (rc && rc != -ENOENT) return rc;
1398 }
1399 }
1400 for (int64_t objectno = ((trunc_objectsetno + 1) * layout.fl_stripe_count) -1;
1401 objectno >= (int64_t)(trunc_objectsetno * layout.fl_stripe_count);
1402 objectno--) {
1403 // if no object existed so far, check object existence
1404 if (!exists) {
1405 uint64_t object_start_off = ((objectno / layout.fl_stripe_count) * layout.fl_object_size) +
1406 ((objectno % layout.fl_stripe_count) * layout.fl_stripe_unit);
1407 exists = (original_size > object_start_off);
1408 }
1409 if (exists) {
1410 // truncate
1411 file_layout_t l;
1412 l.from_legacy(layout);
1413 uint64_t new_object_size = Striper::object_truncate_size(cct(), &l, objectno, size);
1414 int rc;
1415 if (new_object_size > 0 or 0 == objectno) {
1416 // trunc is synchronous as there is no async version
1417 // but note that only a single object will be truncated
1418 // reducing the overload to a fixed amount
1419 rc = m_ioCtx.trunc(getObjectId(soid, objectno), new_object_size);
1420 } else {
1421 // removes are asynchronous in order to speed up truncations of big files
1422 multi_completion->add_request();
1423 RadosRemoveCompletionData *data =
1424 new RadosRemoveCompletionData(multi_completion, cct());
1425 librados::AioCompletion *rados_completion =
1426 librados::Rados::aio_create_completion(data,
1427 rados_req_remove_complete,
1428 rados_req_remove_safe);
1429 rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion);
1430 rados_completion->release();
1431 }
1432 // in case the object did not exist, it means we had a sparse file, all is fine
1433 if (rc && rc != -ENOENT) return rc;
1434 }
1435 }
1436 return 0;
1437 }
1438
1439 int libradosstriper::RadosStriperImpl::grow(const std::string& soid,
1440 uint64_t original_size,
1441 uint64_t size,
1442 ceph_file_layout &layout)
1443 {
1444 // handle the underlying rados objects. As we support sparse objects,
1445 // we only have to change the size in the external attributes
1446 std::ostringstream oss;
1447 oss << size;
1448 bufferlist bl;
1449 bl.append(oss.str());
1450 int rc = m_ioCtx.setxattr(getObjectId(soid, 0), XATTR_SIZE, bl);
1451 return rc;
1452 }
1453
1454 std::string libradosstriper::RadosStriperImpl::getUUID()
1455 {
1456 struct uuid_d uuid;
1457 uuid.generate_random();
1458 char suuid[37];
1459 uuid.print(suuid);
1460 return std::string(suuid);
1461 }