1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "libradosstriper/RadosStriperImpl.h"
23 #include "include/types.h"
24 #include "include/uuid.h"
25 #include "include/ceph_fs.h"
26 #include "common/dout.h"
27 #include "common/strtol.h"
28 #include "osdc/Striper.h"
29 #include "librados/AioCompletionImpl.h"
30 #include <cls/lock/cls_lock_client.h>
33 * This file contains the actual implementation of the rados striped objects interface.
35 * Striped objects are stored in rados in a set of regular rados objects, after their
36 * content has been striped using the osdc/Striper interface.
38 * The external attributes of the striped object are mapped to the attributes of the
39 * first underlying object. This first object has a set of extra external attributes
40 * storing the layout of the striped object for future read back. These attributes are :
41 * - striper.layout.object_size : the size of rados objects used.
42 * Must be a multiple of striper.layout.stripe_unit
43 * - striper.layout.stripe_unit : the size of a stripe unit
44 * - striper.layout.stripe_count : the number of stripes used
45 * - striper.size : total striped object size
47 * In general operations on striped objects are not atomic.
48 * However, a certain number of safety guards have been put to make the interface closer
50 * - each data operation takes a shared lock on the first rados object for the
51 * whole time of the operation
52 * - the remove and trunc operations take an exclusive lock on the first rados object
53 * for the whole time of the operation
54 * This makes sure that no removal/truncation of a striped object occurs while
55 * data operations are happening and vice versa. It thus makes sure that the layout
56 * of a striped object does not change during data operation, which is essential for
59 * Still the writing to a striped object is not atomic. This means in particular that
60 * the size of an object may not be in sync with its content at all times.
61 * As the size is always guaranteed to be updated first and in an atomic way, and as
62 * sparse striped objects are supported (see below), what will typically happen is
63 * that a reader that comes too soon after a write will read 0s instead of the actual
66 * Note that remove handles the pieces of the striped object in reverse order,
67 * so that the head object is removed last, making the completion of the deletion atomic.
69 * Striped objects can be sparse, typically in case data was written at the end of the
70 * striped object only. In such a case, some rados objects constituting the striped object
71 * may be missing. Others can be partial (only the beginning will have data)
72 * When dealing with such sparse striped files, missing objects are detected and
73 * considered as full of 0s. They are however not created until real data is written
76 * There are a number of missing features/improvements that could be implemented.
77 * Here are some ideas :
78 * - implementation of missing entry points (compared to rados)
79 * In particular : clone_range, sparse_read, exec, aio_flush_async, tmaps, omaps, ...
83 #define dout_subsys ceph_subsys_rados
85 #define dout_prefix *_dout << "libradosstriper: "
87 /// size of xattr buffer
88 #define XATTR_BUFFER_SIZE 32
90 /// names of the different xattr entries
91 #define XATTR_LAYOUT_STRIPE_UNIT "striper.layout.stripe_unit"
92 #define XATTR_LAYOUT_STRIPE_COUNT "striper.layout.stripe_count"
93 #define XATTR_LAYOUT_OBJECT_SIZE "striper.layout.object_size"
94 #define XATTR_SIZE "striper.size"
95 #define LOCK_PREFIX "lock."
97 /// name of the lock used on objects to ensure layout stability during IO
98 #define RADOS_LOCK_NAME "striper.lock"
100 /// format of the extension of rados objects created for a given striped object
101 #define RADOS_OBJECT_EXTENSION_FORMAT ".%016llx"
103 /// default object layout
104 struct ceph_file_layout default_file_layout
= {
105 init_le32(1<<22), // fl_stripe_unit
106 init_le32(1), // fl_stripe_count
107 init_le32(1<<22), // fl_object_size
108 init_le32(0), // fl_cas_hash
109 init_le32(0), // fl_object_stripe_unit
110 init_le32(-1), // fl_unused
111 init_le32(-1), // fl_pg_pool
114 using libradosstriper::MultiAioCompletionImplPtr
;
118 ///////////////////////// CompletionData /////////////////////////////
121 * struct handling the data needed to pass to the call back
122 * function in asynchronous operations
124 struct CompletionData
: RefCountedObject
{
126 CompletionData(libradosstriper::RadosStriperImpl
* striper
,
127 const std::string
& soid
,
128 const std::string
& lockCookie
,
129 librados::AioCompletionImpl
*userCompletion
= 0,
132 ~CompletionData() override
;
134 void complete(int r
);
135 /// striper to be used to handle the write completion
136 libradosstriper::RadosStriperImpl
*m_striper
;
137 /// striped object concerned by the write operation
139 /// shared lock to be released at completion
140 std::string m_lockCookie
;
141 /// completion handler
142 librados::IoCtxImpl::C_aio_Complete
*m_ack
;
145 CompletionData::CompletionData
146 (libradosstriper::RadosStriperImpl
* striper
,
147 const std::string
& soid
,
148 const std::string
& lockCookie
,
149 librados::AioCompletionImpl
*userCompletion
,
151 RefCountedObject(striper
->cct(), n
),
152 m_striper(striper
), m_soid(soid
), m_lockCookie(lockCookie
), m_ack(0) {
154 if (userCompletion
) {
155 m_ack
= new librados::IoCtxImpl::C_aio_Complete(userCompletion
);
156 userCompletion
->io
= striper
->m_ioCtxImpl
;
160 CompletionData::~CompletionData() {
161 if (m_ack
) delete m_ack
;
// Forwards the result code r to the completion handler wrapping the user's
// AioCompletionImpl. Does nothing when no handler exists.
165 void CompletionData::complete(int r
) {
// m_ack is only allocated by the constructor when a user completion was given
166 if (m_ack
) m_ack
->finish(r
);
170 * struct handling the data needed to pass to the call back
171 * function in asynchronous read operations
173 struct ReadCompletionData
: CompletionData
{
174 /// bufferlist containing final result
176 /// extents that will be read
177 std::vector
<ObjectExtent
>* m_extents
;
178 /// intermediate results
179 std::vector
<bufferlist
>* m_resultbl
;
180 /// return code of read completion, to be remembered until unlocking happened
182 /// completion object for the unlocking of the striped object at the end of the read
183 librados::AioCompletion
*m_unlockCompletion
;
185 ReadCompletionData(libradosstriper::RadosStriperImpl
* striper
,
186 const std::string
& soid
,
187 const std::string
& lockCookie
,
188 librados::AioCompletionImpl
*userCompletion
,
190 std::vector
<ObjectExtent
>* extents
,
191 std::vector
<bufferlist
>* resultbl
,
194 ~ReadCompletionData() override
;
195 /// complete method for when reading is over
196 void complete_read(int r
);
197 /// complete method for when object is unlocked
198 void complete_unlock(int r
);
201 ReadCompletionData::ReadCompletionData
202 (libradosstriper::RadosStriperImpl
* striper
,
203 const std::string
& soid
,
204 const std::string
& lockCookie
,
205 librados::AioCompletionImpl
*userCompletion
,
207 std::vector
<ObjectExtent
>* extents
,
208 std::vector
<bufferlist
>* resultbl
,
210 CompletionData(striper
, soid
, lockCookie
, userCompletion
, n
),
211 m_bl(bl
), m_extents(extents
), m_resultbl(resultbl
), m_readRc(0),
212 m_unlockCompletion(0) {}
214 ReadCompletionData::~ReadCompletionData() {
215 m_unlockCompletion
->release();
// Called when the reads of the underlying rados objects are over.
// Merges the per-extent intermediate buffers (m_resultbl) into the
// caller's final bufferlist (m_bl) using a Striper::StripedReadResult.
// r is the return code of the read; per the trailing comment it is
// remembered for later, but that statement is not visible in this view.
220 void ReadCompletionData::complete_read(int r
) {
221 // gather data into final buffer
222 Striper::StripedReadResult readResult
;
// bit points into m_resultbl while eit walks m_extents
// NOTE(review): presumably both are advanced together so that entry i of
// m_resultbl pairs with extent i — the increment is not visible here, confirm
223 vector
<bufferlist
>::iterator bit
= m_resultbl
->begin();
224 for (vector
<ObjectExtent
>::iterator eit
= m_extents
->begin();
225 eit
!= m_extents
->end();
// hand the buffer read for this extent over, together with the description
// of where its pieces belong (eit->buffer_extents)
227 readResult
.add_partial_result(m_striper
->cct(), *bit
, eit
->buffer_extents
);
// build the final content into *m_bl
// NOTE(review): the trailing `true` is presumably the zero_tail flag of
// assemble_result — confirm against osdc/Striper.h
230 readResult
.assemble_result(m_striper
->cct(), *m_bl
, true);
231 // Remember return code
// Called once the unlocking of the striped object is over; this is where the
// user is finally notified of the outcome of the read.
// r is the return code of the unlock and is deliberately not used.
// The user completion receives m_readRc when it is non-zero, otherwise the
// length of the final bufferlist m_bl.
235 void ReadCompletionData::complete_unlock(int r
) {
236 // call parent's completion method
237 // Note that we ignore the return code of the unlock as we cannot do much about it
238 CompletionData::complete(m_readRc
?m_readRc
:m_bl
->length());
242 * struct handling the data needed to pass to the call back
243 * function in asynchronous write operations
245 struct WriteCompletionData
: CompletionData
{
246 /// safe completion handler
247 librados::IoCtxImpl::C_aio_Complete
*m_safe
;
248 /// return code of write completion, to be remembered until unlocking happened
250 /// completion object for the unlocking of the striped object at the end of the write
251 librados::AioCompletion
*m_unlockCompletion
;
253 WriteCompletionData(libradosstriper::RadosStriperImpl
* striper
,
254 const std::string
& soid
,
255 const std::string
& lockCookie
,
256 librados::AioCompletionImpl
*userCompletion
,
259 ~WriteCompletionData() override
;
260 /// complete method for when writing is over
261 void complete_write(int r
);
262 /// complete method for when object is unlocked
263 void complete_unlock(int r
);
268 WriteCompletionData::WriteCompletionData
269 (libradosstriper::RadosStriperImpl
* striper
,
270 const std::string
& soid
,
271 const std::string
& lockCookie
,
272 librados::AioCompletionImpl
*userCompletion
,
274 CompletionData(striper
, soid
, lockCookie
, userCompletion
, n
), m_safe(0),
275 m_unlockCompletion(0) {
276 if (userCompletion
) {
277 m_safe
= new librados::IoCtxImpl::C_aio_Complete(userCompletion
);
281 WriteCompletionData::~WriteCompletionData() {
282 m_unlockCompletion
->release();
283 if (m_safe
) delete m_safe
;
// Called once the unlocking of the striped object is over; notifies the user
// of the outcome of the write.
// r is the return code of the unlock and is deliberately not used: the user
// completion receives the stored write return code m_writeRc instead.
286 void WriteCompletionData::complete_unlock(int r
) {
287 // call parent's completion method
288 // Note that we ignore the return code of the unlock as we cannot do much about it
289 CompletionData::complete(m_writeRc
);
292 void WriteCompletionData::complete_write(int r
) {
293 // Remember return code
// Forwards the result code r to the "safe" completion handler.
// Does nothing when no handler exists.
297 void WriteCompletionData::safe(int r
) {
// m_safe is only allocated by the constructor when a user completion was given
298 if (m_safe
) m_safe
->finish(r
);
301 struct RemoveCompletionData
: CompletionData
{
306 * note that the constructed object will take ownership of the lock
308 RemoveCompletionData(libradosstriper::RadosStriperImpl
* striper
,
309 const std::string
& soid
,
310 const std::string
& lockCookie
,
311 librados::AioCompletionImpl
*userCompletion
,
313 CompletionData(striper
, soid
, lockCookie
, userCompletion
), flags(flags
) {}
317 * struct handling the data needed to pass to the call back
318 * function in asynchronous truncate operations
320 struct TruncateCompletionData
: RefCountedObject
{
322 TruncateCompletionData(libradosstriper::RadosStriperImpl
* striper
,
323 const std::string
& soid
,
325 RefCountedObject(striper
->cct()),
326 m_striper(striper
), m_soid(soid
), m_size(size
) {
330 ~TruncateCompletionData() override
{
333 /// striper to be used
334 libradosstriper::RadosStriperImpl
*m_striper
;
335 /// striped object concerned by the truncate operation
337 /// the final size of the truncated object
342 * struct handling the data needed to pass to the call back
343 * function in asynchronous read operations of a Rados File
345 struct RadosReadCompletionData
: RefCountedObject
{
347 RadosReadCompletionData(MultiAioCompletionImplPtr multiAioCompl
,
348 uint64_t expectedBytes
,
350 CephContext
*context
,
352 RefCountedObject(context
, n
),
353 m_multiAioCompl(multiAioCompl
), m_expectedBytes(expectedBytes
), m_bl(bl
) {}
354 /// the multi asynch io completion object to be used
355 MultiAioCompletionImplPtr m_multiAioCompl
;
356 /// the expected number of bytes
357 uint64_t m_expectedBytes
;
358 /// the bufferlist object where data have been written
363 * struct handling (most of) the data needed to pass to the call back
364 * function in asynchronous stat operations.
365 * Inherited by the actual type for adding time information in different
366 * versions (time_t or struct timespec)
368 struct BasicStatCompletionData
: CompletionData
{
370 BasicStatCompletionData(libradosstriper::RadosStriperImpl
* striper
,
371 const std::string
& soid
,
372 librados::AioCompletionImpl
*userCompletion
,
373 libradosstriper::MultiAioCompletionImpl
*multiCompletion
,
376 CompletionData(striper
, soid
, "", userCompletion
, n
),
377 m_multiCompletion(multiCompletion
), m_psize(psize
),
378 m_statRC(0), m_getxattrRC(0) {};
379 // MultiAioCompletionImpl used to handle the double async
380 // call in the back (stat + getxattr)
381 libradosstriper::MultiAioCompletionImpl
*m_multiCompletion
;
382 // where to store the size of the first object
383 // this will be ignored but we need a place to store it when
384 // async stat is called
385 uint64_t m_objectSize
;
386 // where to store the file size
388 /// the bufferlist object used for the getxattr call
390 /// return code of the stat
392 /// return code of the getxattr
397 * struct handling the data needed to pass to the call back
398 * function in asynchronous stat operations.
399 * Simple templated extension of BasicStatCompletionData.
400 * The template parameter is the type of the time information
401 * (used with time_t for stat and struct timespec for stat2)
403 template<class TimeType
>
404 struct StatCompletionData
: BasicStatCompletionData
{
406 StatCompletionData(libradosstriper::RadosStriperImpl
* striper
,
407 const std::string
& soid
,
408 librados::AioCompletionImpl
*userCompletion
,
409 libradosstriper::MultiAioCompletionImpl
*multiCompletion
,
413 BasicStatCompletionData(striper
, soid
, userCompletion
, multiCompletion
, psize
, n
),
415 // where to store the file time
420 * struct handling the data needed to pass to the call back
421 * function in asynchronous remove operations of a Rados File
423 struct RadosRemoveCompletionData
: RefCountedObject
{
425 RadosRemoveCompletionData(MultiAioCompletionImplPtr multiAioCompl
,
426 CephContext
*context
) :
427 RefCountedObject(context
, 2),
428 m_multiAioCompl(multiAioCompl
) {};
429 /// the multi asynch io completion object to be used
430 MultiAioCompletionImplPtr m_multiAioCompl
;
436 ///////////////////////// constructor /////////////////////////////
438 libradosstriper::RadosStriperImpl::RadosStriperImpl(librados::IoCtx
& ioctx
, librados::IoCtxImpl
*ioctx_impl
) :
439 m_refCnt(0),lock("RadosStriper Refcont", false, false), m_radosCluster(ioctx
), m_ioCtx(ioctx
), m_ioCtxImpl(ioctx_impl
),
440 m_layout(default_file_layout
) {}
442 ///////////////////////// layout /////////////////////////////
// Sets the stripe unit of the layout held by this striper (m_layout).
// stripe_unit : the new stripe unit.
// The value is checked before being stored; what is returned on rejection or
// on success is not visible in this view.
444 int libradosstriper::RadosStriperImpl::setObjectLayoutStripeUnit
445 (unsigned int stripe_unit
)
447 /* stripe unit must be non-zero, 64k increment */
// the mask test catches any value having bits set below CEPH_MIN_STRIPE_UNIT
448 if (!stripe_unit
|| (stripe_unit
& (CEPH_MIN_STRIPE_UNIT
-1)))
450 m_layout
.fl_stripe_unit
= stripe_unit
;
// Sets the stripe count of the layout held by this striper (m_layout).
// stripe_count : the new number of stripes.
// NOTE(review): the non-zero check announced by the comment below is not
// visible in this view — confirm it is present.
454 int libradosstriper::RadosStriperImpl::setObjectLayoutStripeCount
455 (unsigned int stripe_count
)
457 /* stripe count must be non-zero */
460 m_layout
.fl_stripe_count
= stripe_count
;
// Sets the object size of the layout held by this striper (m_layout).
// object_size : the new size of the underlying rados objects.
// The value is validated against the stripe unit currently stored in
// m_layout, so the outcome depends on the stripe unit in effect at call time.
// What is returned on rejection or on success is not visible in this view.
464 int libradosstriper::RadosStriperImpl::setObjectLayoutObjectSize
465 (unsigned int object_size
)
467 /* object size must be non-zero, 64k increment */
// the mask test catches any value having bits set below CEPH_MIN_STRIPE_UNIT
468 if (!object_size
|| (object_size
& (CEPH_MIN_STRIPE_UNIT
-1)))
470 /* object size must be a multiple of stripe unit */
// smaller than one stripe unit, or leaving a remainder : both are caught
471 if (object_size
< m_layout
.fl_stripe_unit
||
472 object_size
% m_layout
.fl_stripe_unit
)
474 m_layout
.fl_object_size
= object_size
;
478 ///////////////////////// xattrs /////////////////////////////
480 int libradosstriper::RadosStriperImpl::getxattr(const object_t
& soid
,
484 std::string firstObjOid
= getObjectId(soid
, 0);
485 return m_ioCtx
.getxattr(firstObjOid
, name
, bl
);
488 int libradosstriper::RadosStriperImpl::setxattr(const object_t
& soid
,
492 std::string firstObjOid
= getObjectId(soid
, 0);
493 return m_ioCtx
.setxattr(firstObjOid
, name
, bl
);
// Retrieves the external attributes of striped object soid into attrset.
// The attributes are read from the first underlying rados object, and the
// entries used internally for striping and locking are then removed from
// attrset so that only the other attributes remain.
// How rc is checked and what is returned is not visible in this view.
496 int libradosstriper::RadosStriperImpl::getxattrs(const object_t
& soid
,
497 map
<string
, bufferlist
>& attrset
)
// attributes of a striped object live on its first rados object
499 std::string firstObjOid
= getObjectId(soid
, 0);
500 int rc
= m_ioCtx
.getxattrs(firstObjOid
, attrset
);
502 // cleanup internal attributes dedicated to striping and locking
503 attrset
.erase(XATTR_LAYOUT_STRIPE_UNIT
);
504 attrset
.erase(XATTR_LAYOUT_STRIPE_COUNT
);
505 attrset
.erase(XATTR_LAYOUT_OBJECT_SIZE
);
506 attrset
.erase(XATTR_SIZE
);
// the lock entry is named LOCK_PREFIX followed by the lock name
507 attrset
.erase(std::string(LOCK_PREFIX
) + RADOS_LOCK_NAME
);
511 int libradosstriper::RadosStriperImpl::rmxattr(const object_t
& soid
,
514 std::string firstObjOid
= getObjectId(soid
, 0);
515 return m_ioCtx
.rmxattr(firstObjOid
, name
);
518 ///////////////////////// io /////////////////////////////
520 int libradosstriper::RadosStriperImpl::write(const std::string
& soid
,
521 const bufferlist
& bl
,
525 // open the object. This will create it if needed, retrieve its layout
526 // and size and take a shared lock on it
527 ceph_file_layout layout
;
528 std::string lockCookie
;
529 int rc
= createAndOpenStripedObject(soid
, &layout
, len
+off
, &lockCookie
, true);
531 return write_in_open_object(soid
, layout
, lockCookie
, bl
, len
, off
);
534 int libradosstriper::RadosStriperImpl::append(const std::string
& soid
,
535 const bufferlist
& bl
,
538 // open the object. This will create it if needed, retrieve its layout
539 // and size and take a shared lock on it
540 ceph_file_layout layout
;
542 std::string lockCookie
;
543 int rc
= openStripedObjectForWrite(soid
, &layout
, &size
, &lockCookie
, false);
545 return write_in_open_object(soid
, layout
, lockCookie
, bl
, len
, size
);
// Replaces the whole content of striped object soid with bl.
// Implemented as a truncation to size 0 followed by a write of bl at
// offset 0. These are two separate calls, so the replacement as a whole is
// not a single atomic step.
// Returns the error of trunc() when it fails with anything but -ENOENT,
// otherwise the return value of write().
548 int libradosstriper::RadosStriperImpl::write_full(const std::string
& soid
,
549 const bufferlist
& bl
)
551 int rc
= trunc(soid
, 0);
// a missing object is not an error here : the write below is still attempted
552 if (rc
&& rc
!= -ENOENT
) return rc
; // ENOENT is obviously ok
553 return write(soid
, bl
, bl
.length(), 0);
556 int libradosstriper::RadosStriperImpl::read(const std::string
& soid
,
561 // create a completion object
562 librados::AioCompletionImpl c
;
563 // call asynchronous method
564 int rc
= aio_read(soid
, &c
, bl
, len
, off
);
565 // and wait for completion
567 // wait for completion
568 c
.wait_for_complete_and_cb();
570 rc
= c
.get_return_value();
575 ///////////////////////// asynchronous io /////////////////////////////
577 int libradosstriper::RadosStriperImpl::aio_write(const std::string
& soid
,
578 librados::AioCompletionImpl
*c
,
579 const bufferlist
& bl
,
583 ceph_file_layout layout
;
584 std::string lockCookie
;
585 int rc
= createAndOpenStripedObject(soid
, &layout
, len
+off
, &lockCookie
, true);
587 return aio_write_in_open_object(soid
, c
, layout
, lockCookie
, bl
, len
, off
);
590 int libradosstriper::RadosStriperImpl::aio_append(const std::string
& soid
,
591 librados::AioCompletionImpl
*c
,
592 const bufferlist
& bl
,
595 ceph_file_layout layout
;
597 std::string lockCookie
;
598 int rc
= openStripedObjectForWrite(soid
, &layout
, &size
, &lockCookie
, false);
600 // create a completion object
601 return aio_write_in_open_object(soid
, c
, layout
, lockCookie
, bl
, len
, size
);
// Asynchronous counterpart of write_full : replaces the whole content of
// striped object soid with bl and reports the outcome through completion c.
// Implemented as a truncation to size 0 followed by aio_write of bl at
// offset 0.
// NOTE(review): trunc() looks like the same synchronous call write_full
// uses, in which case only the write part is asynchronous — confirm.
// How rc is checked between the two calls is not visible in this view.
604 int libradosstriper::RadosStriperImpl::aio_write_full(const std::string
& soid
,
605 librados::AioCompletionImpl
*c
,
606 const bufferlist
& bl
)
608 int rc
= trunc(soid
, 0);
610 return aio_write(soid
, c
, bl
, bl
.length(), 0);
// Callback invoked when the unlocking that ends a striped read is over.
// arg is the ReadCompletionData of the read; the call is forwarded to its
// complete_unlock method.
// NOTE(review): aio_read registers this callback through
// librados::Rados::aio_create_completion, yet c is cast here to a
// MultiAioCompletionImpl. The value read from it is unused by
// complete_unlock, but the handle type deserves confirmation.
613 static void rados_read_aio_unlock_complete(rados_striper_multi_completion_t c
, void *arg
)
615 auto cdata
= reinterpret_cast<ReadCompletionData
*>(arg
);
616 libradosstriper::MultiAioCompletionImpl
*comp
=
617 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
618 cdata
->complete_unlock(comp
->rval
);
// Callback of the multi completion grouping the reads of a striped read.
// c   : the MultiAioCompletionImpl that completed
// arg : the ReadCompletionData of the read
// Starts the asynchronous unlock of the striped object, then assembles the
// data that was read. The user is only notified later, from
// ReadCompletionData::complete_unlock.
622 static void striper_read_aio_req_complete(rados_striper_multi_completion_t c
, void *arg
)
624 auto cdata
= reinterpret_cast<ReadCompletionData
*>(arg
);
625 // launch the async unlocking of the object
626 cdata
->m_striper
->aio_unlockObject(cdata
->m_soid
, cdata
->m_lockCookie
, cdata
->m_unlockCompletion
);
627 // complete the read part in parallel
628 libradosstriper::MultiAioCompletionImpl
*comp
=
629 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
// rval of the multi completion is handed over as the result of the read
630 cdata
->complete_read(comp
->rval
);
// "safe" callback of the read of one underlying rados object.
// c   : the rados completion of that read
// arg : the RadosReadCompletionData of that read
// Reports the return code to the multi completion grouping all reads of the
// striped read, with -ENOENT turned into success.
633 static void rados_req_read_safe(rados_completion_t c
, void *arg
)
635 auto data
= reinterpret_cast<RadosReadCompletionData
*>(arg
);
636 int rc
= rados_aio_get_return_value(c
);
637 // ENOENT means that we are dealing with a sparse file. This is fine,
638 // data (0s) will be created on the fly by the rados_req_read_complete method
639 if (rc
== -ENOENT
) rc
= 0;
640 auto multiAioComp
= data
->m_multiAioCompl
;
641 multiAioComp
->safe_request(rc
);
// "complete" callback of the read of one underlying rados object.
// c   : the rados completion of that read
// arg : the RadosReadCompletionData of that read
// When fewer bytes than expected were read, the buffer of the request is
// completed with zeros up to m_expectedBytes and the request is reported as
// having read m_expectedBytes. The result is then passed on to the multi
// completion grouping all reads of the striped read.
645 static void rados_req_read_complete(rados_completion_t c
, void *arg
)
647 auto data
= reinterpret_cast<RadosReadCompletionData
*>(arg
);
648 int rc
= rados_aio_get_return_value(c
);
649 // We need to handle the case of sparse files here
651 // the object did not exist at all. This can happen for sparse files.
652 // we consider we've read 0 bytes and it will fall into next case
655 if (rc
>= 0 && (((uint64_t)rc
) < data
->m_expectedBytes
)) {
656 // only partial data were present in the object (or the object did not
657 // even exist if we've gone through previous case).
658 // This is typical of sparse file and we need to complete with 0s.
// total number of bytes that must read back as zeros
// NOTE(review): m_expectedBytes is a uint64_t while this is an unsigned int,
// so the difference is narrowed — confirm extents never exceed 32 bits
659 unsigned int lenOfZeros
= data
->m_expectedBytes
-rc
;
// part of those bytes that already exists in the bufferlist, past the first
// rc bytes
660 unsigned int existingDataToZero
= min(data
->m_bl
->length()-rc
, lenOfZeros
);
661 if (existingDataToZero
> 0) {
// overwrite that existing part in place
662 data
->m_bl
->zero(rc
, existingDataToZero
);
664 if (lenOfZeros
> existingDataToZero
) {
// the bufferlist is too short : append a new buffer for the rest
// NOTE(review): no statement clearing this new buffer is visible in this
// view — confirm it is zeroed before being appended
665 ceph::bufferptr
zeros(ceph::buffer::create(lenOfZeros
-existingDataToZero
));
667 data
->m_bl
->push_back(zeros
);
// from here on the request counts as a full read
669 rc
= data
->m_expectedBytes
;
671 auto multiAioComp
= data
->m_multiAioCompl
;
672 multiAioComp
->complete_request(rc
);
676 int libradosstriper::RadosStriperImpl::aio_read(const std::string
& soid
,
677 librados::AioCompletionImpl
*c
,
682 // open the object. This will retrieve its layout and size
683 // and take a shared lock on it
684 ceph_file_layout layout
;
686 std::string lockCookie
;
687 int rc
= openStripedObjectForRead(soid
, &layout
, &size
, &lockCookie
);
689 // find out the actual number of bytes we can read
692 // nothing to read ! We are done.
695 read_len
= min(len
, (size_t)(size
-off
));
697 // get list of extents to be read from
698 vector
<ObjectExtent
> *extents
= new vector
<ObjectExtent
>();
700 std::string format
= soid
+ RADOS_OBJECT_EXTENSION_FORMAT
;
702 l
.from_legacy(layout
);
703 Striper::file_to_extents(cct(), format
.c_str(), &l
, off
, read_len
,
707 // create a completion object and transfer ownership of extents and resultbl
708 vector
<bufferlist
> *resultbl
= new vector
<bufferlist
>(extents
->size());
709 ReadCompletionData
*cdata
= new ReadCompletionData(this, soid
, lockCookie
, c
,
710 bl
, extents
, resultbl
, 1);
713 // create a completion for the unlocking of the striped object at the end of the read
714 librados::AioCompletion
*unlock_completion
=
715 librados::Rados::aio_create_completion(cdata
, rados_read_aio_unlock_complete
, 0);
716 cdata
->m_unlockCompletion
= unlock_completion
;
717 // create the multiCompletion object handling the reads
718 MultiAioCompletionImplPtr nc
{new libradosstriper::MultiAioCompletionImpl
,
720 nc
->set_complete_callback(cdata
, striper_read_aio_req_complete
);
721 // go through the extents
723 for (vector
<ObjectExtent
>::iterator p
= extents
->begin(); p
!= extents
->end(); ++p
) {
724 // create a buffer list describing where to place data read from current extent
725 bufferlist
*oid_bl
= &((*resultbl
)[i
++]);
726 for (vector
<pair
<uint64_t,uint64_t> >::iterator q
= p
->buffer_extents
.begin();
727 q
!= p
->buffer_extents
.end();
729 bufferlist buffer_bl
;
730 buffer_bl
.substr_of(*bl
, q
->first
, q
->second
);
731 oid_bl
->append(buffer_bl
);
733 // read all extents of a given object in one go
735 // we need 2 references on data as both rados_req_read_safe and rados_req_read_complete
737 RadosReadCompletionData
*data
= new RadosReadCompletionData(nc
, p
->length
, oid_bl
, cct(), 2);
738 librados::AioCompletion
*rados_completion
=
739 librados::Rados::aio_create_completion(data
, rados_req_read_complete
, rados_req_read_safe
);
740 r
= m_ioCtx
.aio_read(p
->oid
.name
, rados_completion
, oid_bl
, p
->length
, p
->offset
);
741 rados_completion
->release();
745 nc
->finish_adding_requests();
749 int libradosstriper::RadosStriperImpl::aio_read(const std::string
& soid
,
750 librados::AioCompletionImpl
*c
,
755 // create a buffer list and store it inside the completion object
757 c
->bl
.push_back(buffer::create_static(len
, buf
));
758 // call the bufferlist version of this method
759 return aio_read(soid
, c
, &c
->bl
, len
, off
);
762 int libradosstriper::RadosStriperImpl::aio_flush()
765 // pass to the rados level
766 ret
= m_ioCtx
.aio_flush();
769 //wait all CompletionData are released
777 ///////////////////////// stat and deletion /////////////////////////////
// Synchronous stat of striped object soid, built on top of aio_stat : the
// asynchronous call is issued on a local completion which is then waited for.
// soid   : name of the striped object
// psize  : passed on to aio_stat to receive the size
// pmtime : passed on to aio_stat to receive the modification time
// The return value of the completion is stored in rc; the final return
// statement is not visible in this view.
779 int libradosstriper::RadosStriperImpl::stat(const std::string
& soid
, uint64_t *psize
, time_t *pmtime
)
781 // create a completion object
782 librados::AioCompletionImpl c
;
783 // call asynchronous version of stat
784 int rc
= aio_stat(soid
, &c
, psize
, pmtime
);
786 // wait for completion of the stat
// NOTE(review): stat2, remove and read wait with wait_for_complete_and_cb()
// while this one uses wait_for_complete() — confirm the difference is intended
787 c
.wait_for_complete();
789 rc
= c
.get_return_value();
794 static void striper_stat_aio_stat_complete(rados_completion_t c
, void *arg
) {
795 auto data
= reinterpret_cast<BasicStatCompletionData
*>(arg
);
796 int rc
= rados_aio_get_return_value(c
);
798 // remember this has failed
801 data
->m_multiCompletion
->complete_request(rc
);
805 static void striper_stat_aio_getxattr_complete(rados_completion_t c
, void *arg
) {
806 auto data
= reinterpret_cast<BasicStatCompletionData
*>(arg
);
807 int rc
= rados_aio_get_return_value(c
);
808 // We need to handle the case of sparse files here
810 // remember this has failed
811 data
->m_getxattrRC
= rc
;
813 // this intermediate string allows to add a null terminator before calling strtol
815 std::string
strsize(data
->m_bl
.c_str(), data
->m_bl
.length());
816 *data
->m_psize
= strict_strtoll(strsize
.c_str(), 10, &err
);
818 lderr(data
->m_striper
->cct()) << XATTR_SIZE
<< " : " << err
<< dendl
;
819 data
->m_getxattrRC
= -EINVAL
;
823 data
->m_multiCompletion
->complete_request(rc
);
827 static void striper_stat_aio_req_complete(rados_striper_multi_completion_t c
,
829 auto data
= reinterpret_cast<BasicStatCompletionData
*>(arg
);
830 if (data
->m_statRC
) {
831 data
->complete(data
->m_statRC
);
833 if (data
->m_getxattrRC
< 0) {
834 data
->complete(data
->m_getxattrRC
);
842 template<class TimeType
>
843 int libradosstriper::RadosStriperImpl::aio_generic_stat
844 (const std::string
& soid
,
845 librados::AioCompletionImpl
*c
,
848 typename
libradosstriper::RadosStriperImpl::StatFunction
<TimeType
>::Type statFunction
)
850 // use a MultiAioCompletion object for dealing with the fact
851 // that we'll do 2 asynchronous calls in parallel
852 MultiAioCompletionImplPtr multi_completion
{
853 new libradosstriper::MultiAioCompletionImpl
, false};
854 // Data object used for passing context to asynchronous calls
855 std::string firstObjOid
= getObjectId(soid
, 0);
856 StatCompletionData
<TimeType
> *cdata
=
857 new StatCompletionData
<TimeType
>(this, firstObjOid
, c
,
858 multi_completion
.get(), psize
, pmtime
, 4);
859 multi_completion
->set_complete_callback(cdata
, striper_stat_aio_req_complete
);
860 // use a regular AioCompletion for the stat async call
861 librados::AioCompletion
*stat_completion
=
862 librados::Rados::aio_create_completion(cdata
, striper_stat_aio_stat_complete
, 0);
863 multi_completion
->add_safe_request();
864 object_t
obj(firstObjOid
);
865 int rc
= (m_ioCtxImpl
->*statFunction
)(obj
, stat_completion
->pc
,
866 &cdata
->m_objectSize
, cdata
->m_pmtime
);
867 stat_completion
->release();
869 // nothing is really started so cancel everything
873 // use a regular AioCompletion for the getxattr async call
874 librados::AioCompletion
*getxattr_completion
=
875 librados::Rados::aio_create_completion(cdata
, striper_stat_aio_getxattr_complete
, 0);
876 multi_completion
->add_safe_request();
877 // in parallel, get the pmsize from the first object asynchronously
878 rc
= m_ioCtxImpl
->aio_getxattr(obj
, getxattr_completion
->pc
,
879 XATTR_SIZE
, cdata
->m_bl
);
880 getxattr_completion
->release();
881 multi_completion
->finish_adding_requests();
883 // the async stat is ongoing, so we need to go on
884 // we mark the getxattr as failed in the data object
885 cdata
->m_getxattrRC
= rc
;
886 multi_completion
->complete_request(rc
);
893 int libradosstriper::RadosStriperImpl::aio_stat(const std::string
& soid
,
894 librados::AioCompletionImpl
*c
,
898 return aio_generic_stat
<time_t>(soid
, c
, psize
, pmtime
, &librados::IoCtxImpl::aio_stat
);
// Synchronous stat of striped object soid with a struct timespec time, built
// on top of aio_stat2 : the asynchronous call is issued on a local completion
// which is then waited for.
// soid  : name of the striped object
// psize : passed on to aio_stat2 to receive the size
// pts   : passed on to aio_stat2 to receive the modification time
// The return value of the completion is stored in rc; the final return
// statement is not visible in this view.
901 int libradosstriper::RadosStriperImpl::stat2(const std::string
& soid
, uint64_t *psize
, struct timespec
*pts
)
903 // create a completion object
904 librados::AioCompletionImpl c
;
905 // call asynchronous version of stat
906 int rc
= aio_stat2(soid
, &c
, psize
, pts
);
908 // wait for completion of the stat
909 c
.wait_for_complete_and_cb();
911 rc
= c
.get_return_value();
916 int libradosstriper::RadosStriperImpl::aio_stat2(const std::string
& soid
,
917 librados::AioCompletionImpl
*c
,
919 struct timespec
*pts
)
921 return aio_generic_stat
<struct timespec
>(soid
, c
, psize
, pts
, &librados::IoCtxImpl::aio_stat2
);
924 static void rados_req_remove_complete(rados_completion_t c
, void *arg
)
926 auto cdata
= reinterpret_cast<RadosRemoveCompletionData
*>(arg
);
927 int rc
= rados_aio_get_return_value(c
);
928 // in case the object did not exist, it means we had a sparse file, all is fine
932 cdata
->m_multiAioCompl
->complete_request(rc
);
936 static void rados_req_remove_safe(rados_completion_t c
, void *arg
)
938 auto cdata
= reinterpret_cast<RadosRemoveCompletionData
*>(arg
);
939 int rc
= rados_aio_get_return_value(c
);
940 // in case the object did not exist, it means we had a sparse file, all is fine
944 cdata
->m_multiAioCompl
->safe_request(rc
);
948 static void striper_remove_aio_req_complete(rados_striper_multi_completion_t c
, void *arg
)
950 auto cdata
= reinterpret_cast<RemoveCompletionData
*>(arg
);
951 libradosstriper::MultiAioCompletionImpl
*comp
=
952 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
953 ldout(cdata
->m_striper
->cct(), 10)
954 << "RadosStriperImpl : striper_remove_aio_req_complete called for "
955 << cdata
->m_soid
<< dendl
;
958 // All went fine, synchronously remove first object
959 rc
= cdata
->m_striper
->m_ioCtx
.remove(cdata
->m_striper
->getObjectId(cdata
->m_soid
, 0),
962 lderr(cdata
->m_striper
->cct())
963 << "RadosStriperImpl : deletion/truncation incomplete for " << cdata
->m_soid
964 << ", as errors were encountered. The file is left present but it's content "
965 << " has been partially removed"
972 int libradosstriper::RadosStriperImpl::remove(const std::string
& soid
, int flags
)
974 // create a completion object
975 librados::AioCompletionImpl c
;
976 // call asynchronous version of remove
977 int rc
= aio_remove(soid
, &c
, flags
);
979 // wait for completion of the remove
980 c
.wait_for_complete_and_cb();
982 rc
= c
.get_return_value();
// Asynchronous removal of the striped object soid.
// Takes an exclusive lock on the first rados object with a fresh UUID cookie,
// wraps the caller's completion c in a RemoveCompletionData, registers
// striper_remove_aio_req_complete as complete-callback of a new
// MultiAioCompletionImpl and hands that multi-completion to internal_aio_remove.
987 int libradosstriper::RadosStriperImpl::aio_remove(const std::string
& soid
,
988 librados::AioCompletionImpl
*c
,
991 // the RemoveCompletionData object will lock the given soid for the duration
// NOTE(review): in the code below the exclusive lock is taken directly through
// m_ioCtx.lock_exclusive; RemoveCompletionData only receives the cookie.
993 std::string lockCookie
= getUUID();
994 int rc
= m_ioCtx
.lock_exclusive(getObjectId(soid
, 0), RADOS_LOCK_NAME
, lockCookie
, "", 0, 0);
996 // create CompletionData for the async remove call
997 RemoveCompletionData
*cdata
= new RemoveCompletionData(this, soid
, lockCookie
, c
, flags
);
998 MultiAioCompletionImplPtr multi_completion
{
999 new libradosstriper::MultiAioCompletionImpl
, false};
1000 multi_completion
->set_complete_callback(cdata
, striper_remove_aio_req_complete
);
1001 // call asynchronous internal version of remove
1003 << "RadosStriperImpl : Aio_remove starting for "
// NOTE(review): flags is stored in cdata above but is not passed to
// internal_aio_remove in this call -- confirm that this is intended.
1005 rc
= internal_aio_remove(soid
, multi_completion
);
// Issues the asynchronous removal of the rados objects of soid, from the last
// one down to index 1; the first object (index 0) is not removed here.
// soid             : name of the striped object
// multi_completion : gets one add_request() per removal issued, then
//                    finish_adding_requests() once the loop is over
// The number of rados objects is derived from the XATTR_SIZE attribute, or
// counted with successive stat calls when that attribute cannot be used.
1009 int libradosstriper::RadosStriperImpl::internal_aio_remove(
1010 const std::string
& soid
,
1011 MultiAioCompletionImplPtr multi_completion
,
1014 std::string firstObjOid
= getObjectId(soid
, 0);
1016 // check size and get number of rados objects to delete
1017 uint64_t nb_objects
= 0;
1019 int rc
= getxattr(soid
, XATTR_SIZE
, bl2
);
1021 // no object size (or not able to get it)
1022 // try to find the number of objects "by hand"
// probe object 0, 1, 2, ... until stat returns non-zero
1025 while (!m_ioCtx
.stat(getObjectId(soid
, nb_objects
), &psize
, &pmtime
)) {
1029 // count total number of rados objects in the striped object
1031 // this intermediate string adds a null terminator before strtol is called
1032 std::string
strsize(bl2
.c_str(), bl2
.length());
1033 uint64_t size
= strict_strtoll(strsize
.c_str(), 10, &err
);
1035 lderr(cct()) << XATTR_SIZE
<< " : " << err
<< dendl
;
// NOTE(review): the layout used here is the member m_layout, not the layout
// attributes stored on the object (those are read by
// internal_get_layout_and_size) -- confirm both always agree.
1039 uint64_t object_size
= m_layout
.fl_object_size
;
1040 uint64_t su
= m_layout
.fl_stripe_unit
;
1041 uint64_t stripe_count
= m_layout
.fl_stripe_count
;
// full object sets, plus the objects touched by the trailing partial set
// (one per started stripe unit, capped at stripe_count)
1042 uint64_t nb_complete_sets
= size
/ (object_size
*stripe_count
);
1043 uint64_t remaining_data
= size
% (object_size
*stripe_count
);
1044 uint64_t remaining_stripe_units
= (remaining_data
+ su
-1) / su
;
1045 uint64_t remaining_objects
= std::min(remaining_stripe_units
, stripe_count
);
1046 nb_objects
= nb_complete_sets
* stripe_count
+ remaining_objects
;
1048 // delete rados objects in reverse order
1049 // Note that we do not drop the first object. This one will only be dropped
1050 // if all other removals have been successful, and this is done in the
1051 // callback of the multi_completion object
// NOTE(review): nb_objects is uint64_t while the loop index is int; a count
// that does not fit in int would not be iterated correctly.
1053 for (int i
= nb_objects
-1; i
>= 1; i
--) {
1054 multi_completion
->add_request();
1055 RadosRemoveCompletionData
*data
=
1056 new RadosRemoveCompletionData(multi_completion
, cct());
1057 librados::AioCompletion
*rados_completion
=
1058 librados::Rados::aio_create_completion(data
,
1059 rados_req_remove_complete
,
1060 rados_req_remove_safe
);
// variant of the call without flags
1062 rcr
= m_ioCtx
.aio_remove(getObjectId(soid
, i
), rados_completion
);
// variant of the call forwarding flags
1064 rcr
= m_ioCtx
.aio_remove(getObjectId(soid
, i
), rados_completion
, flags
);
1066 rados_completion
->release();
// -ENOENT is not treated as an error here
// NOTE(review): the condition tests rcr but the message prints rc --
// probably meant to print rcr.
1067 if (rcr
< 0 and -ENOENT
!= rcr
) {
1068 lderr(cct()) << "RadosStriperImpl::remove : deletion incomplete for " << soid
1069 << ", as " << getObjectId(soid
, i
) << " could not be deleted (rc=" << rc
<< ")"
1074 // we are done adding requests to the multi_completion object
1075 multi_completion
->finish_adding_requests();
1078 } catch (ErrorCode
&e
) {
1079 // error caught when trying to take the exclusive lock
// Changes the size of the striped object soid to size.
// Takes an exclusive cls lock on the first rados object, reads the stored
// layout and current size, calls truncate() when size is smaller than the
// current size or grow() when it is larger, then releases the lock.
1085 int libradosstriper::RadosStriperImpl::trunc(const std::string
& soid
, uint64_t size
)
1087 // lock the object in exclusive mode
1088 std::string firstObjOid
= getObjectId(soid
, 0);
1089 librados::ObjectWriteOperation op
;
1091 std::string lockCookie
= RadosStriperImpl::getUUID();
1092 utime_t dur
= utime_t();
// NOTE(review): this call passes "" where the LOCK_SHARED calls in
// openStripedObjectForRead/openStripedObjectForWrite pass "Tag" -- confirm
// the difference is intended.
1093 rados::cls::lock::lock(&op
, RADOS_LOCK_NAME
, LOCK_EXCLUSIVE
, lockCookie
, "", "", dur
, 0);
1094 int rc
= m_ioCtx
.operate(firstObjOid
, &op
);
1096 // load layout and size
1097 ceph_file_layout layout
;
1098 uint64_t original_size
;
1099 rc
= internal_get_layout_and_size(firstObjOid
, &layout
, &original_size
);
// shrink or extend; neither branch is taken when the size is unchanged
1101 if (size
< original_size
) {
1102 rc
= truncate(soid
, original_size
, size
, layout
);
1103 } else if (size
> original_size
) {
1104 rc
= grow(soid
, original_size
, size
, layout
);
1107 // unlock object, ignore return code as we cannot do much
1108 m_ioCtx
.unlock(firstObjOid
, RADOS_LOCK_NAME
, lockCookie
);
1114 ///////////////////////// private helpers /////////////////////////////
// Builds the name of rados object number objectno of the striped object soid:
// soid, a '.', then objectno written in hexadecimal, zero-padded to 16 characters.
1116 std::string
libradosstriper::RadosStriperImpl::getObjectId(const object_t
& soid
,
1117 long long unsigned objectno
)
1119 std::ostringstream s
;
1120 s
<< soid
<< '.' << std::setfill ('0') << std::setw(16) << std::hex
<< objectno
;
// Synchronously releases the RADOS_LOCK_NAME lock identified by lockCookie
// on the first rados object of soid. The result of unlock is not used.
1124 void libradosstriper::RadosStriperImpl::unlockObject(const std::string
& soid
,
1125 const std::string
& lockCookie
)
1127 // unlock the shared lock on the first rados object
1128 std::string firstObjOid
= getObjectId(soid
, 0);
1129 m_ioCtx
.unlock(firstObjOid
, RADOS_LOCK_NAME
, lockCookie
);
// Asynchronous counterpart of unlockObject: issues aio_unlock of the
// RADOS_LOCK_NAME lock identified by lockCookie on the first rados object of
// soid, with c as the completion of that request.
1132 void libradosstriper::RadosStriperImpl::aio_unlockObject(const std::string
& soid
,
1133 const std::string
& lockCookie
,
1134 librados::AioCompletion
*c
)
1136 // unlock the shared lock on the first rados object
1137 std::string firstObjOid
= getObjectId(soid
, 0);
1138 m_ioCtx
.aio_unlock(firstObjOid
, RADOS_LOCK_NAME
, lockCookie
, c
);
// Callback of the unlock completion created by the write paths
// (write_in_open_object / aio_write_in_open_object register it).
// arg is the WriteCompletionData of the write; its complete_unlock() is
// called with the rval read from c.
// NOTE(review): the callback is registered on a librados completion but is
// typed as, and casts c to, a striper multi-completion -- confirm this is safe.
1141 static void rados_write_aio_unlock_complete(rados_striper_multi_completion_t c
, void *arg
)
1143 auto cdata
= reinterpret_cast<WriteCompletionData
*>(arg
);
1144 libradosstriper::MultiAioCompletionImpl
*comp
=
1145 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
1146 cdata
->complete_unlock(comp
->rval
);
// Complete-callback of the multi-completion of a striped write.
// arg is the WriteCompletionData of the write. The callback first starts the
// asynchronous unlock of the striped object, then reports the write result
// (rval of the multi-completion c) through complete_write().
1150 static void striper_write_aio_req_complete(rados_striper_multi_completion_t c
, void *arg
)
1152 auto cdata
= reinterpret_cast<WriteCompletionData
*>(arg
);
1153 // launch the async unlocking of the object
1154 cdata
->m_striper
->aio_unlockObject(cdata
->m_soid
, cdata
->m_lockCookie
, cdata
->m_unlockCompletion
);
1155 // complete the write part in parallel
1156 libradosstriper::MultiAioCompletionImpl
*comp
=
1157 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
1158 cdata
->complete_write(comp
->rval
);
// Safe-callback of the multi-completion of a striped write.
// arg is the WriteCompletionData of the write; its safe() is called with the
// rval of the multi-completion c.
1162 static void striper_write_aio_req_safe(rados_striper_multi_completion_t c
, void *arg
)
1164 auto cdata
= reinterpret_cast<WriteCompletionData
*>(arg
);
1165 libradosstriper::MultiAioCompletionImpl
*comp
=
1166 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
1167 cdata
->safe(comp
->rval
);
// Synchronous write of bl into the striped object soid, for which the caller
// supplies the layout and the cookie of the lock it holds.
// Builds a WriteCompletionData (no user completion: 0 is passed), an unlock
// completion and a multi-completion, runs internal_aio_write, then waits for
// completion, safety and unlocking before reading the return value of the
// multi-completion.
1171 int libradosstriper::RadosStriperImpl::write_in_open_object(const std::string
& soid
,
1172 const ceph_file_layout
& layout
,
1173 const std::string
& lockCookie
,
1174 const bufferlist
& bl
,
1177 // create a completion object to be passed to the callbacks of the multicompletion
1178 // we need 3 references as striper_write_aio_req_complete will release two and
1179 // striper_write_aio_req_safe will release one
1180 WriteCompletionData
*cdata
= new WriteCompletionData(this, soid
, lockCookie
, 0, 3);
1181 cdata
->get(); // local ref
1182 // create a completion object for the unlocking of the striped object at the end of the write
1183 librados::AioCompletion
*unlock_completion
=
1184 librados::Rados::aio_create_completion(cdata
, rados_write_aio_unlock_complete
, 0);
1185 cdata
->m_unlockCompletion
= unlock_completion
;
1186 // create the multicompletion that will handle the write completion
1187 MultiAioCompletionImplPtr c
{new libradosstriper::MultiAioCompletionImpl
,
1189 c
->set_complete_callback(cdata
, striper_write_aio_req_complete
);
1190 c
->set_safe_callback(cdata
, striper_write_aio_req_safe
);
1191 // call the asynchronous API
1192 int rc
= internal_aio_write(soid
, c
, bl
, len
, off
, layout
);
1194 // wait for completion and safety of data
1195 c
->wait_for_complete_and_cb();
1196 c
->wait_for_safe_and_cb();
1197 // wait for the unlocking
1198 unlock_completion
->wait_for_complete();
// rc now carries the value stored in the multi-completion
1200 rc
= c
->get_return_value();
// Asynchronous write of bl into the striped object soid, for which the caller
// supplies the layout and the cookie of the lock it holds.
// Same set-up as write_in_open_object, except that the caller's completion c
// is stored in the WriteCompletionData and nothing is waited for here.
1206 int libradosstriper::RadosStriperImpl::aio_write_in_open_object(const std::string
& soid
,
1207 librados::AioCompletionImpl
*c
,
1208 const ceph_file_layout
& layout
,
1209 const std::string
& lockCookie
,
1210 const bufferlist
& bl
,
1213 // create a completion object to be passed to the callbacks of the multicompletion
1214 // we need 3 references as striper_write_aio_req_complete will release two and
1215 // striper_write_aio_req_safe will release one
1216 WriteCompletionData
*cdata
= new WriteCompletionData(this, soid
, lockCookie
, c
, 3);
1217 cdata
->get(); // local ref
// attach this striper's IoCtxImpl to the caller's completion
1219 c
->io
= m_ioCtxImpl
;
1220 // create a completion object for the unlocking of the striped object at the end of the write
1221 librados::AioCompletion
*unlock_completion
=
1222 librados::Rados::aio_create_completion(cdata
, rados_write_aio_unlock_complete
, 0);
1223 cdata
->m_unlockCompletion
= unlock_completion
;
1224 // create the multicompletion that will handle the write completion
1225 libradosstriper::MultiAioCompletionImplPtr nc
{
1226 new libradosstriper::MultiAioCompletionImpl
, false};
1227 nc
->set_complete_callback(cdata
, striper_write_aio_req_complete
);
1228 nc
->set_safe_callback(cdata
, striper_write_aio_req_safe
);
1229 // internal asynchronous API
1230 int rc
= internal_aio_write(soid
, nc
, bl
, len
, off
, layout
);
// Safe-callback of one per-object rados write issued by internal_aio_write.
// arg is the MultiAioCompletionImpl of the striped write (internal_aio_write
// passes c.get()); the return value of the rados completion c is forwarded
// to its safe_request().
1235 static void rados_req_write_safe(rados_completion_t c
, void *arg
)
1237 libradosstriper::MultiAioCompletionImpl
*comp
=
1238 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(arg
);
1239 comp
->safe_request(rados_aio_get_return_value(c
));
// Complete-callback of one per-object rados write issued by internal_aio_write.
// arg is the MultiAioCompletionImpl of the striped write (internal_aio_write
// passes c.get()); the return value of the rados completion c is forwarded
// to its complete_request().
1242 static void rados_req_write_complete(rados_completion_t c
, void *arg
)
1244 libradosstriper::MultiAioCompletionImpl
*comp
=
1245 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(arg
);
1246 comp
->complete_request(rados_aio_get_return_value(c
));
// Splits the write of bl (len bytes at offset off of the striped object soid)
// into per-rados-object extents with Striper::file_to_extents and issues one
// aio_write per extent.
// c      : multi-completion of the striped write; it is the callback argument
//          of every rados completion created here and gets
//          finish_adding_requests() at the end
// layout : legacy layout, converted with from_legacy() before striping
1250 libradosstriper::RadosStriperImpl::internal_aio_write(const std::string
& soid
,
1251 libradosstriper::MultiAioCompletionImplPtr c
,
1252 const bufferlist
& bl
,
1255 const ceph_file_layout
& layout
)
1258 // Do not try anything if we are called with empty buffer,
1259 // file_to_extents would raise an exception
1261 // get list of extents to be written to
1262 vector
<ObjectExtent
> extents
;
// object name format handed to the striper: soid followed by RADOS_OBJECT_EXTENSION_FORMAT
1263 std::string format
= soid
+ RADOS_OBJECT_EXTENSION_FORMAT
;
1265 l
.from_legacy(layout
);
1266 Striper::file_to_extents(cct(), format
.c_str(), &l
, off
, len
, 0, extents
);
1267 // go through the extents
1268 for (vector
<ObjectExtent
>::iterator p
= extents
.begin(); p
!= extents
.end(); ++p
) {
1269 // assemble pieces of a given object into a single buffer list
// each buffer extent is a (first, second) pair passed to substr_of on bl
1271 for (vector
<pair
<uint64_t,uint64_t> >::iterator q
= p
->buffer_extents
.begin();
1272 q
!= p
->buffer_extents
.end();
1274 bufferlist buffer_bl
;
1275 buffer_bl
.substr_of(bl
, q
->first
, q
->second
);
1276 oid_bl
.append(buffer_bl
);
1278 // and write the object
1280 librados::AioCompletion
*rados_completion
=
1281 librados::Rados::aio_create_completion(c
.get(),
1282 rados_req_write_complete
,
1283 rados_req_write_safe
);
1284 r
= m_ioCtx
.aio_write(p
->oid
.name
, rados_completion
, oid_bl
,
1285 p
->length
, p
->offset
);
1286 rados_completion
->release();
// no further request will be added to c
1291 c
->finish_adding_requests();
// Looks up key in attrs and, when present, parses its content as a base-10
// number with strict_strtol into *value. The parser's error string is
// reported through lderr together with the key.
1295 int libradosstriper::RadosStriperImpl::extract_uint32_attr
1296 (std::map
<std::string
, bufferlist
> &attrs
,
1297 const std::string
& key
,
1300 std::map
<std::string
, bufferlist
>::iterator attrsIt
= attrs
.find(key
);
1301 if (attrsIt
!= attrs
.end()) {
1302 // this intermediate string adds a null terminator before strtol is called
1303 std::string
strvalue(attrsIt
->second
.c_str(), attrsIt
->second
.length());
1305 *value
= strict_strtol(strvalue
.c_str(), 10, &err
);
1307 lderr(cct()) << key
<< " : " << err
<< dendl
;
// Looks up key in attrs and, when present, parses its content as a base-10
// number with strict_strtoll into *value. The parser's error string is
// reported through lderr together with the key.
1316 int libradosstriper::RadosStriperImpl::extract_sizet_attr
1317 (std::map
<std::string
, bufferlist
> &attrs
,
1318 const std::string
& key
,
1321 std::map
<std::string
, bufferlist
>::iterator attrsIt
= attrs
.find(key
);
1322 if (attrsIt
!= attrs
.end()) {
1323 // this intermediate string adds a null terminator before strtol is called
1324 std::string
strvalue(attrsIt
->second
.c_str(), attrsIt
->second
.length());
1326 *value
= strict_strtoll(strvalue
.c_str(), 10, &err
);
1328 lderr(cct()) << key
<< " : " << err
<< dendl
;
// Reads the extended attributes of the rados object oid and fills *layout
// (stripe unit, stripe count, object size) from them; the XATTR_SIZE
// attribute is parsed into the local ssize. fl_pg_pool is set to 0.
// Callers in this file pass the name of the first rados object as oid.
1337 int libradosstriper::RadosStriperImpl::internal_get_layout_and_size(
1338 const std::string
& oid
,
1339 ceph_file_layout
*layout
,
1342 // get external attributes of the first rados object
1343 std::map
<std::string
, bufferlist
> attrs
;
1344 int rc
= m_ioCtx
.getxattrs(oid
, attrs
);
1346 // deal with stripe_unit
1347 rc
= extract_uint32_attr(attrs
, XATTR_LAYOUT_STRIPE_UNIT
, &layout
->fl_stripe_unit
);
1349 // deal with stripe_count
1350 rc
= extract_uint32_attr(attrs
, XATTR_LAYOUT_STRIPE_COUNT
, &layout
->fl_stripe_count
);
1352 // deal with object_size
1353 rc
= extract_uint32_attr(attrs
, XATTR_LAYOUT_OBJECT_SIZE
, &layout
->fl_object_size
);
// deal with the total size of the striped object
1357 rc
= extract_sizet_attr(attrs
, XATTR_SIZE
, &ssize
);
1362 // make valgrind happy by setting unused fl_pg_pool
1363 layout
->fl_pg_pool
= 0;
// Prepares the striped object soid for reading: takes a shared cls lock on the
// first rados object under a fresh UUID cookie (returned through *lockCookie)
// and loads the stored layout and size into *layout and size.
1367 int libradosstriper::RadosStriperImpl::openStripedObjectForRead(
1368 const std::string
& soid
,
1369 ceph_file_layout
*layout
,
1371 std::string
*lockCookie
)
1373 // take a lock on the first rados object, if it exists, and get its size
1374 // check, lock and size reading must be atomic and are thus done within a single operation
// NOTE(review): in the code below only the lock goes through op; layout and
// size are read by a separate internal_get_layout_and_size call afterwards.
1375 librados::ObjectWriteOperation op
;
1377 *lockCookie
= getUUID();
1378 utime_t dur
= utime_t();
1379 rados::cls::lock::lock(&op
, RADOS_LOCK_NAME
, LOCK_SHARED
, *lockCookie
, "Tag", "", dur
, 0);
1380 std::string firstObjOid
= getObjectId(soid
, 0);
1381 int rc
= m_ioCtx
.operate(firstObjOid
, &op
);
1383 // error case (including -ENOENT)
1386 rc
= internal_get_layout_and_size(firstObjOid
, layout
, size
);
// failure path: the lock taken above is released and the error is logged
1388 unlockObject(soid
, *lockCookie
);
1389 lderr(cct()) << "RadosStriperImpl::openStripedObjectForRead : "
1390 << "could not load layout and size for "
1391 << soid
<< " : rc = " << rc
<< dendl
;
// Prepares the striped object soid for writing: takes a shared cls lock on the
// first rados object under a fresh UUID cookie (returned through *lockCookie),
// loads the stored layout, and updates the XATTR_SIZE attribute from *size
// through a cmpxattr-guarded write operation.
// When the first object does not exist (-ENOENT) the work is delegated to
// createAndOpenStripedObject, which in turn calls back into this function.
1396 int libradosstriper::RadosStriperImpl::openStripedObjectForWrite(const std::string
& soid
,
1397 ceph_file_layout
*layout
,
1399 std::string
*lockCookie
,
1400 bool isFileSizeAbsolute
)
1402 // take a lock on the first rados object, if it exists
1403 // check and lock must be atomic and are thus done within a single operation
1404 librados::ObjectWriteOperation op
;
1406 *lockCookie
= getUUID();
1407 utime_t dur
= utime_t();
1408 rados::cls::lock::lock(&op
, RADOS_LOCK_NAME
, LOCK_SHARED
, *lockCookie
, "Tag", "", dur
, 0);
1409 std::string firstObjOid
= getObjectId(soid
, 0);
1410 int rc
= m_ioCtx
.operate(firstObjOid
, &op
);
1412 if (rc
== -ENOENT
) {
1413 // object does not exist, delegate to createAndOpenStripedObject
// NOTE(review): this inner rc shadows the rc declared above.
1414 int rc
= createAndOpenStripedObject(soid
, layout
, *size
, lockCookie
, isFileSizeAbsolute
);
1415 // return original size
1424 rc
= internal_get_layout_and_size(firstObjOid
, layout
, &curSize
);
// failure path: the lock taken above is released and the error is logged
1426 unlockObject(soid
, *lockCookie
);
1427 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1428 << "could not load layout and size for "
1429 << soid
<< " : rc = " << rc
<< dendl
;
1432 // atomically update object size, only if smaller than current one
1433 if (!isFileSizeAbsolute
)
// the setxattr below shares one operation with a cmpxattr guard on XATTR_SIZE
1435 librados::ObjectWriteOperation writeOp
;
1436 writeOp
.cmpxattr(XATTR_SIZE
, LIBRADOS_CMPXATTR_OP_GT
, *size
);
1437 std::ostringstream oss
;
1440 bl
.append(oss
.str());
1441 writeOp
.setxattr(XATTR_SIZE
, bl
);
1442 rc
= m_ioCtx
.operate(firstObjOid
, &writeOp
);
1443 // return current size
1445 // handle case where objectsize is already bigger than size
1446 if (-ECANCELED
== rc
)
// failure path: the lock taken above is released and the error is logged
1449 unlockObject(soid
, *lockCookie
);
1450 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1451 << "could not set new size for "
1452 << soid
<< " : rc = " << rc
<< dendl
;
// Creates the first rados object of the striped object soid together with its
// layout and size attributes in one write operation, then opens the striped
// object through openStripedObjectForWrite.
// The layout attributes are written from the member m_layout. The size
// attribute is written as size when isFileSizeAbsolute is true, 0 otherwise.
// -EEXIST from the creation is not treated as an error; any other non-zero
// result is returned as is.
1457 int libradosstriper::RadosStriperImpl::createAndOpenStripedObject(const std::string
& soid
,
1458 ceph_file_layout
*layout
,
1460 std::string
*lockCookie
,
1461 bool isFileSizeAbsolute
)
1463 // build atomic write operation
1464 librados::ObjectWriteOperation writeOp
;
// create(true): exclusive creation -- the error check below expects -EEXIST
// when the object is already there
1465 writeOp
.create(true);
// object_size attribute, written as decimal text
1467 std::ostringstream oss_object_size
;
1468 oss_object_size
<< m_layout
.fl_object_size
;
1469 bufferlist bl_object_size
;
1470 bl_object_size
.append(oss_object_size
.str());
1471 writeOp
.setxattr(XATTR_LAYOUT_OBJECT_SIZE
, bl_object_size
);
// stripe_unit attribute, written as decimal text
1473 std::ostringstream oss_stripe_unit
;
1474 oss_stripe_unit
<< m_layout
.fl_stripe_unit
;
1475 bufferlist bl_stripe_unit
;
1476 bl_stripe_unit
.append(oss_stripe_unit
.str());
1477 writeOp
.setxattr(XATTR_LAYOUT_STRIPE_UNIT
, bl_stripe_unit
);
// stripe_count attribute, written as decimal text
1479 std::ostringstream oss_stripe_count
;
1480 oss_stripe_count
<< m_layout
.fl_stripe_count
;
1481 bufferlist bl_stripe_count
;
1482 bl_stripe_count
.append(oss_stripe_count
.str());
1483 writeOp
.setxattr(XATTR_LAYOUT_STRIPE_COUNT
, bl_stripe_count
);
// size attribute: size when it is absolute, 0 otherwise
1485 std::ostringstream oss_size
;
1486 oss_size
<< (isFileSizeAbsolute
?size
:0);
1488 bl_size
.append(oss_size
.str());
1489 writeOp
.setxattr(XATTR_SIZE
, bl_size
);
1490 // effectively change attributes
1491 std::string firstObjOid
= getObjectId(soid
, 0);
1492 int rc
= m_ioCtx
.operate(firstObjOid
, &writeOp
);
1493 // in case of error (but no EEXIST which would mean the object existed), return
1494 if (rc
&& -EEXIST
!= rc
) return rc
;
1495 // Otherwise open the object
// a local copy of size is passed, as openStripedObjectForWrite takes a pointer
1496 uint64_t fileSize
= size
;
1497 return openStripedObjectForWrite(soid
, layout
, &fileSize
, lockCookie
, isFileSizeAbsolute
);
// Complete-callback of the multi-completion used by truncate().
// arg is the TruncateCompletionData built there. When the multi-completion c
// reports rval == 0, the new size (m_size) is written as decimal text into
// the XATTR_SIZE attribute of the striped object through the striper's setxattr.
1500 static void striper_truncate_aio_req_complete(rados_striper_multi_completion_t c
, void *arg
)
1502 auto cdata
= reinterpret_cast<TruncateCompletionData
*>(arg
);
1503 libradosstriper::MultiAioCompletionImpl
*comp
=
1504 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
1505 if (0 == comp
->rval
) {
1506 // all went fine, change size in the external attributes
1507 std::ostringstream oss
;
1508 oss
<< cdata
->m_size
;
1510 bl
.append(oss
.str());
// the result of setxattr is not used in this statement
1511 cdata
->m_striper
->setxattr(cdata
->m_soid
, XATTR_SIZE
, bl
);
// Synchronous truncation of the striped object soid from original_size down
// to size.
// Builds a TruncateCompletionData and a multi-completion whose complete-callback
// is striper_truncate_aio_req_complete, runs aio_truncate, closes the request
// list, waits on the multi-completion and reads its return value.
1516 int libradosstriper::RadosStriperImpl::truncate(const std::string
& soid
,
1517 uint64_t original_size
,
1519 ceph_file_layout
&layout
)
1521 TruncateCompletionData
*cdata
= new TruncateCompletionData(this, soid
, size
);
1522 libradosstriper::MultiAioCompletionImplPtr multi_completion
{
1523 new libradosstriper::MultiAioCompletionImpl
, false};
1524 multi_completion
->set_complete_callback(cdata
, striper_truncate_aio_req_complete
);
1525 // call asynchronous version of truncate
1526 int rc
= aio_truncate(soid
, multi_completion
, original_size
, size
, layout
);
1527 // wait for completion of the truncation
// finish_adding_requests() is called here, not inside aio_truncate
1528 multi_completion
->finish_adding_requests();
1529 multi_completion
->wait_for_complete_and_cb();
// rc now carries the value stored in the multi-completion
1532 rc
= multi_completion
->get_return_value();
// Shrinks the rados objects backing the striped object soid from
// original_size to size.
// multi_completion gets one add_request() per asynchronous removal issued
// here; it is not closed with finish_adding_requests() in this function.
// A non-zero result other than -ENOENT from a removal or truncation is
// returned immediately.
1537 int libradosstriper::RadosStriperImpl::aio_truncate
1538 (const std::string
& soid
,
1539 libradosstriper::MultiAioCompletionImplPtr multi_completion
,
1540 uint64_t original_size
,
1542 ceph_file_layout
&layout
)
1544 // handle the underlying rados objects. 3 cases here :
1545 // -- the objects belonging to object sets entirely located
1546 // before the truncation are unchanged
1547 // -- the objects belonging to the object set where the
1548 // truncation took place are truncated or removed
1549 // -- the objects belonging to object sets entirely located
1550 // after the truncation are removed
1551 // Note that we do it backward and that we change the size in
1552 // the external attributes only at the end. This makes sure that
1553 // no rados object stays behind if we remove the striped object
1554 // after a truncation has failed
// index of the object set containing the new end, and of the one containing the old end
1555 uint64_t trunc_objectsetno
= size
/ layout
.fl_object_size
/ layout
.fl_stripe_count
;
1556 uint64_t last_objectsetno
= original_size
/ layout
.fl_object_size
/ layout
.fl_stripe_count
;
1557 bool exists
= false;
// first loop: objects of the object sets located after the truncation set,
// visited from the highest object number downwards
1558 for (int64_t objectno
= (last_objectsetno
+1) * layout
.fl_stripe_count
-1;
1559 objectno
>= (int64_t)((trunc_objectsetno
+ 1) * layout
.fl_stripe_count
);
1561 // if no object existed so far, check object existence
1563 uint64_t nb_full_object_set
= objectno
/ layout
.fl_stripe_count
;
1564 uint64_t object_index_in_set
= objectno
% layout
.fl_stripe_count
;
1565 uint64_t set_start_off
= nb_full_object_set
* layout
.fl_object_size
* layout
.fl_stripe_count
;
1566 uint64_t object_start_off
= set_start_off
+ object_index_in_set
* layout
.fl_stripe_unit
;
// the object is considered present when the old size reaches past its first byte
1567 exists
= (original_size
> object_start_off
);
1570 // remove asynchronously
1571 multi_completion
->add_request();
1572 RadosRemoveCompletionData
*data
=
1573 new RadosRemoveCompletionData(multi_completion
, cct());
1574 librados::AioCompletion
*rados_completion
=
1575 librados::Rados::aio_create_completion(data
,
1576 rados_req_remove_complete
,
1577 rados_req_remove_safe
);
1578 int rc
= m_ioCtx
.aio_remove(getObjectId(soid
, objectno
), rados_completion
);
1579 rados_completion
->release();
1580 // in case the object did not exist, it means we had a sparse file, all is fine
1581 if (rc
&& rc
!= -ENOENT
) return rc
;
// second loop: objects of the object set where the truncation takes place,
// again from the highest object number downwards
1584 for (int64_t objectno
= ((trunc_objectsetno
+ 1) * layout
.fl_stripe_count
) -1;
1585 objectno
>= (int64_t)(trunc_objectsetno
* layout
.fl_stripe_count
);
1587 // if no object existed so far, check object existence
// NOTE(review): the set offset here is (objectno / fl_stripe_count) *
// fl_object_size, while the first loop also multiplies by fl_stripe_count --
// confirm which formula is intended.
1589 uint64_t object_start_off
= ((objectno
/ layout
.fl_stripe_count
) * layout
.fl_object_size
) +
1590 ((objectno
% layout
.fl_stripe_count
) * layout
.fl_stripe_unit
);
1591 exists
= (original_size
> object_start_off
);
1596 l
.from_legacy(layout
);
// size this rados object must have once the striped object is size bytes long
1597 uint64_t new_object_size
= Striper::object_truncate_size(cct(), &l
, objectno
, size
);
// object 0 is always truncated rather than removed, even down to size 0
1599 if (new_object_size
> 0 or 0 == objectno
) {
1600 // trunc is synchronous as there is no async version
1601 // but note that only a single object will be truncated
1602 // reducing the overhead to a fixed amount
1603 rc
= m_ioCtx
.trunc(getObjectId(soid
, objectno
), new_object_size
);
1605 // removes are asynchronous in order to speed up truncations of big files
1606 multi_completion
->add_request();
1607 RadosRemoveCompletionData
*data
=
1608 new RadosRemoveCompletionData(multi_completion
, cct());
1609 librados::AioCompletion
*rados_completion
=
1610 librados::Rados::aio_create_completion(data
,
1611 rados_req_remove_complete
,
1612 rados_req_remove_safe
);
1613 rc
= m_ioCtx
.aio_remove(getObjectId(soid
, objectno
), rados_completion
);
1614 rados_completion
->release();
1616 // in case the object did not exist, it means we had a sparse file, all is fine
1617 if (rc
&& rc
!= -ENOENT
) return rc
;
// Extends the striped object soid: no rados object is touched here other than
// the first one, whose XATTR_SIZE attribute is rewritten with the content of
// oss.
1623 int libradosstriper::RadosStriperImpl::grow(const std::string
& soid
,
1624 uint64_t original_size
,
1626 ceph_file_layout
&layout
)
1628 // handle the underlying rados objects. As we support sparse objects,
1629 // we only have to change the size in the external attributes
1630 std::ostringstream oss
;
1633 bl
.append(oss
.str());
1634 int rc
= m_ioCtx
.setxattr(getObjectId(soid
, 0), XATTR_SIZE
, bl
);
// Generates a random uuid and returns suuid as a std::string.
// Used in this file to produce lock cookies.
// NOTE(review): suuid is presumably the printed form of uuid -- confirm.
1638 std::string
libradosstriper::RadosStriperImpl::getUUID()
1641 uuid
.generate_random();
1644 return std::string(suuid
);