1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include <boost/algorithm/string/replace.hpp>
17 #include "libradosstriper/RadosStriperImpl.h"
25 #include "include/types.h"
26 #include "include/uuid.h"
27 #include "include/ceph_fs.h"
28 #include "common/dout.h"
29 #include "common/strtol.h"
30 #include "common/RefCountedObj.h"
31 #include "osdc/Striper.h"
32 #include "librados/AioCompletionImpl.h"
33 #include <cls/lock/cls_lock_client.h>
36 * This file contains the actual implementation of the rados striped objects interface.
38 * Striped objects are stored in rados in a set of regular rados objects, after their
39 * content has been striped using the osdc/Striper interface.
41 * The external attributes of the striped object are mapped to the attributes of the
42 * first underlying object. This first object has a set of extra external attributes
43 * storing the layout of the striped object for future read back. These attributes are :
44 * - striper.layout.object_size : the size of rados objects used.
45 * Must be a multiple of striper.layout.stripe_unit
46 * - striper.layout.stripe_unit : the size of a stripe unit
47 * - striper.layout.stripe_count : the number of stripes used
48 * - striper.size : total striped object size
50 * In general operations on striped objects are not atomic.
51 * However, a certain number of safety guards have been put to make the interface closer
53 * - each data operation takes a shared lock on the first rados object for the
54 * whole time of the operation
55 * - the remove and trunc operations take an exclusive lock on the first rados object
56 * for the whole time of the operation
57 * This makes sure that no removal/truncation of a striped object occurs while
58 * data operations are happening and vice versa. It thus makes sure that the layout
59 * of a striped object does not change during data operation, which is essential for
62 * Still the writing to a striped object is not atomic. This means in particular that
63 * the size of an object may not be in sync with its content at all times.
64 * As the size is always guaranteed to be updated first and in an atomic way, and as
65 * sparse striped objects are supported (see below), what will typically happen is
66 * that a reader that comes too soon after a write will read 0s instead of the actual
69 * Note that remove handles the pieces of the striped object in reverse order,
70 * so that the head object is removed last, making the completion of the deletion atomic.
72 * Striped objects can be sparse, typically in case data was written at the end of the
73 * striped object only. In such a case, some rados objects constituting the striped object
74 * may be missing. Others can be partial (only the beginning will have data).
75 * When dealing with such sparse striped files, missing objects are detected and
76 * considered as full of 0s. They are however not created until real data is written
79 * There are a number of missing features/improvements that could be implemented.
80 * Here are some ideas :
81 * - implementation of missing entry points (compared to rados)
82 * In particular : clone_range, sparse_read, exec, aio_flush_async, tmaps, omaps, ...
86 #define dout_subsys ceph_subsys_rados
88 #define dout_prefix *_dout << "libradosstriper: "
90 /// size of xattr buffer
91 #define XATTR_BUFFER_SIZE 32
93 /// names of the different xattr entries
94 #define XATTR_LAYOUT_STRIPE_UNIT "striper.layout.stripe_unit"
95 #define XATTR_LAYOUT_STRIPE_COUNT "striper.layout.stripe_count"
96 #define XATTR_LAYOUT_OBJECT_SIZE "striper.layout.object_size"
97 #define XATTR_SIZE "striper.size"
98 #define LOCK_PREFIX "lock."
100 /// name of the lock used on objects to ensure layout stability during IO
101 #define RADOS_LOCK_NAME "striper.lock"
103 /// format of the extension of rados objects created for a given striped object
104 #define RADOS_OBJECT_EXTENSION_FORMAT ".%016llx"
106 /// default object layout
107 static const struct ceph_file_layout default_file_layout
= {
108 ceph_le32(1<<22), // fl_stripe_unit
109 ceph_le32(1), // fl_stripe_count
110 ceph_le32(1<<22), // fl_object_size
111 ceph_le32(0), // fl_cas_hash
112 ceph_le32(0), // fl_object_stripe_unit
113 ceph_le32(-1), // fl_unused
114 ceph_le32(-1), // fl_pg_pool
121 using libradosstriper::MultiAioCompletionImplPtr
;
125 ///////////////////////// CompletionData /////////////////////////////
128 * struct handling the data needed to pass to the call back
129 * function in asynchronous operations
131 struct CompletionData
: RefCountedObject
{
133 void complete(int r
);
134 /// striper to be used to handle the write completion
135 libradosstriper::RadosStriperImpl
*m_striper
;
136 /// striped object concerned by the write operation
138 /// shared lock to be released at completion
139 std::string m_lockCookie
;
140 /// completion handler
141 librados::IoCtxImpl::C_aio_Complete
*m_ack
;
143 CompletionData(libradosstriper::RadosStriperImpl
* striper
,
144 const std::string
& soid
,
145 const std::string
& lockCookie
,
146 librados::AioCompletionImpl
*userCompletion
= 0);
147 ~CompletionData() override
;
151 CompletionData::CompletionData
152 (libradosstriper::RadosStriperImpl
* striper
,
153 const std::string
& soid
,
154 const std::string
& lockCookie
,
155 librados::AioCompletionImpl
*userCompletion
) :
156 RefCountedObject(striper
->cct()),
157 m_striper(striper
), m_soid(soid
), m_lockCookie(lockCookie
), m_ack(0) {
159 if (userCompletion
) {
160 m_ack
= new librados::IoCtxImpl::C_aio_Complete(userCompletion
);
161 userCompletion
->io
= striper
->m_ioCtxImpl
;
165 CompletionData::~CompletionData() {
166 if (m_ack
) delete m_ack
;
170 void CompletionData::complete(int r
) {
171 if (m_ack
) m_ack
->finish(r
);
175 * struct handling the data needed to pass to the call back
176 * function in asynchronous read operations
178 struct ReadCompletionData
: CompletionData
{
179 /// bufferlist containing final result
181 /// extents that will be read
182 std::vector
<ObjectExtent
>* m_extents
;
183 /// intermediate results
184 std::vector
<bufferlist
>* m_resultbl
;
185 /// return code of read completion, to be remembered until unlocking happened
187 /// completion object for the unlocking of the striped object at the end of the read
188 librados::AioCompletion
*m_unlockCompletion
;
189 /// complete method for when reading is over
190 void complete_read(int r
);
191 /// complete method for when object is unlocked
192 void complete_unlock(int r
);
195 FRIEND_MAKE_REF(ReadCompletionData
);
196 ReadCompletionData(libradosstriper::RadosStriperImpl
* striper
,
197 const std::string
& soid
,
198 const std::string
& lockCookie
,
199 librados::AioCompletionImpl
*userCompletion
,
201 std::vector
<ObjectExtent
>* extents
,
202 std::vector
<bufferlist
>* resultbl
);
203 ~ReadCompletionData() override
;
206 ReadCompletionData::ReadCompletionData
207 (libradosstriper::RadosStriperImpl
* striper
,
208 const std::string
& soid
,
209 const std::string
& lockCookie
,
210 librados::AioCompletionImpl
*userCompletion
,
212 std::vector
<ObjectExtent
>* extents
,
213 std::vector
<bufferlist
>* resultbl
) :
214 CompletionData(striper
, soid
, lockCookie
, userCompletion
),
215 m_bl(bl
), m_extents(extents
), m_resultbl(resultbl
), m_readRc(0),
216 m_unlockCompletion(0) {}
218 ReadCompletionData::~ReadCompletionData() {
219 m_unlockCompletion
->release();
224 void ReadCompletionData::complete_read(int r
) {
225 // gather data into final buffer
226 Striper::StripedReadResult readResult
;
227 vector
<bufferlist
>::iterator bit
= m_resultbl
->begin();
228 for (vector
<ObjectExtent
>::iterator eit
= m_extents
->begin();
229 eit
!= m_extents
->end();
231 readResult
.add_partial_result(m_striper
->cct(), *bit
, eit
->buffer_extents
);
234 readResult
.assemble_result(m_striper
->cct(), *m_bl
, true);
235 // Remember return code
239 void ReadCompletionData::complete_unlock(int r
) {
240 // call parent's completion method
241 // Note that we ignore the return code of the unlock as we cannot do much about it
242 CompletionData::complete(m_readRc
?m_readRc
:m_bl
->length());
246 * struct handling the data needed to pass to the call back
247 * function in asynchronous write operations
249 struct WriteCompletionData
: CompletionData
{
250 /// safe completion handler
251 librados::IoCtxImpl::C_aio_Complete
*m_safe
;
252 /// completion object for the unlocking of the striped object at the end of the write
253 librados::AioCompletion
*m_unlockCompletion
;
254 /// return code of write completion, to be remembered until unlocking happened
256 /// complete method for when writing is over
257 void complete_write(int r
);
258 /// complete method for when object is unlocked
259 void complete_unlock(int r
);
263 FRIEND_MAKE_REF(WriteCompletionData
);
265 WriteCompletionData(libradosstriper::RadosStriperImpl
* striper
,
266 const std::string
& soid
,
267 const std::string
& lockCookie
,
268 librados::AioCompletionImpl
*userCompletion
);
270 ~WriteCompletionData() override
;
273 WriteCompletionData::WriteCompletionData
274 (libradosstriper::RadosStriperImpl
* striper
,
275 const std::string
& soid
,
276 const std::string
& lockCookie
,
277 librados::AioCompletionImpl
*userCompletion
) :
278 CompletionData(striper
, soid
, lockCookie
, userCompletion
),
279 m_safe(0), m_unlockCompletion(0), m_writeRc(0) {
280 if (userCompletion
) {
281 m_safe
= new librados::IoCtxImpl::C_aio_Complete(userCompletion
);
285 WriteCompletionData::~WriteCompletionData() {
286 m_unlockCompletion
->release();
287 if (m_safe
) delete m_safe
;
290 void WriteCompletionData::complete_unlock(int r
) {
291 // call parent's completion method
292 // Note that we ignore the return code of the unlock as we cannot do much about it
293 CompletionData::complete(m_writeRc
);
296 void WriteCompletionData::complete_write(int r
) {
297 // Remember return code
301 void WriteCompletionData::safe(int r
) {
302 if (m_safe
) m_safe
->finish(r
);
305 struct RemoveCompletionData
: CompletionData
{
310 FRIEND_MAKE_REF(RemoveCompletionData
);
313 * note that the constructed object will take ownership of the lock
315 RemoveCompletionData(libradosstriper::RadosStriperImpl
* striper
,
316 const std::string
& soid
,
317 const std::string
& lockCookie
,
318 librados::AioCompletionImpl
*userCompletion
,
320 CompletionData(striper
, soid
, lockCookie
, userCompletion
), flags(flags
) {}
324 * struct handling the data needed to pass to the call back
325 * function in asynchronous truncate operations
327 struct TruncateCompletionData
: RefCountedObject
{
328 /// striper to be used
329 libradosstriper::RadosStriperImpl
*m_striper
;
330 /// striped object concerned by the truncate operation
332 /// the final size of the truncated object
336 FRIEND_MAKE_REF(TruncateCompletionData
);
338 TruncateCompletionData(libradosstriper::RadosStriperImpl
* striper
,
339 const std::string
& soid
,
341 RefCountedObject(striper
->cct()),
342 m_striper(striper
), m_soid(soid
), m_size(size
) {
346 ~TruncateCompletionData() override
{
352 * struct handling the data needed to pass to the call back
353 * function in asynchronous read operations of a Rados File
355 struct RadosReadCompletionData
: RefCountedObject
{
356 /// the multi asynch io completion object to be used
357 MultiAioCompletionImplPtr m_multiAioCompl
;
358 /// the expected number of bytes
359 uint64_t m_expectedBytes
;
360 /// the bufferlist object where data have been written
364 FRIEND_MAKE_REF(RadosReadCompletionData
);
366 RadosReadCompletionData(MultiAioCompletionImplPtr multiAioCompl
,
367 uint64_t expectedBytes
,
369 CephContext
*context
) :
370 RefCountedObject(context
),
371 m_multiAioCompl(multiAioCompl
), m_expectedBytes(expectedBytes
), m_bl(bl
) {}
375 * struct handling (most of) the data needed to pass to the call back
376 * function in asynchronous stat operations.
377 * Inherited by the actual type for adding time information in different
378 * versions (time_t or struct timespec)
380 struct BasicStatCompletionData
: CompletionData
{
381 // MultiAioCompletionImpl used to handle the double aysnc
382 // call in the back (stat + getxattr)
383 libradosstriper::MultiAioCompletionImpl
*m_multiCompletion
;
384 // where to store the size of first objct
385 // this will be ignored but we need a place to store it when
386 // async stat is called
387 uint64_t m_objectSize
;
388 // where to store the file size
390 /// the bufferlist object used for the getxattr call
392 /// return code of the stat
394 /// return code of the getxattr
399 BasicStatCompletionData(libradosstriper::RadosStriperImpl
* striper
,
400 const std::string
& soid
,
401 librados::AioCompletionImpl
*userCompletion
,
402 libradosstriper::MultiAioCompletionImpl
*multiCompletion
,
404 CompletionData(striper
, soid
, "", userCompletion
),
405 m_multiCompletion(multiCompletion
), m_psize(psize
),
406 m_statRC(0), m_getxattrRC(0) {};
411 * struct handling the data needed to pass to the call back
412 * function in asynchronous stat operations.
413 * Simple templated extension of BasicStatCompletionData.
414 * The template parameter is the type of the time information
415 * (used with time_t for stat and struct timespec for stat2)
417 template<class TimeType
>
418 struct StatCompletionData
: BasicStatCompletionData
{
419 // where to store the file time
422 FRIEND_MAKE_REF(StatCompletionData
);
424 StatCompletionData(libradosstriper::RadosStriperImpl
* striper
,
425 const std::string
& soid
,
426 librados::AioCompletionImpl
*userCompletion
,
427 libradosstriper::MultiAioCompletionImpl
*multiCompletion
,
430 BasicStatCompletionData(striper
, soid
, userCompletion
, multiCompletion
, psize
),
435 * struct handling the data needed to pass to the call back
436 * function in asynchronous remove operations of a Rados File
438 struct RadosRemoveCompletionData
: RefCountedObject
{
439 /// the multi asynch io completion object to be used
440 MultiAioCompletionImplPtr m_multiAioCompl
;
442 FRIEND_MAKE_REF(RadosRemoveCompletionData
);
444 RadosRemoveCompletionData(MultiAioCompletionImplPtr multiAioCompl
,
445 CephContext
*context
) :
446 RefCountedObject(context
),
447 m_multiAioCompl(multiAioCompl
) {};
453 ///////////////////////// constructor /////////////////////////////
455 libradosstriper::RadosStriperImpl::RadosStriperImpl(librados::IoCtx
& ioctx
, librados::IoCtxImpl
*ioctx_impl
) :
456 m_refCnt(0), m_radosCluster(ioctx
), m_ioCtx(ioctx
), m_ioCtxImpl(ioctx_impl
),
457 m_layout(default_file_layout
) {}
459 ///////////////////////// layout /////////////////////////////
461 int libradosstriper::RadosStriperImpl::setObjectLayoutStripeUnit
462 (unsigned int stripe_unit
)
464 /* stripe unit must be non-zero, 64k increment */
465 if (!stripe_unit
|| (stripe_unit
& (CEPH_MIN_STRIPE_UNIT
-1)))
467 m_layout
.fl_stripe_unit
= stripe_unit
;
471 int libradosstriper::RadosStriperImpl::setObjectLayoutStripeCount
472 (unsigned int stripe_count
)
474 /* stripe count must be non-zero */
477 m_layout
.fl_stripe_count
= stripe_count
;
481 int libradosstriper::RadosStriperImpl::setObjectLayoutObjectSize
482 (unsigned int object_size
)
484 /* object size must be non-zero, 64k increment */
485 if (!object_size
|| (object_size
& (CEPH_MIN_STRIPE_UNIT
-1)))
487 /* object size must be a multiple of stripe unit */
488 if (object_size
< m_layout
.fl_stripe_unit
||
489 object_size
% m_layout
.fl_stripe_unit
)
491 m_layout
.fl_object_size
= object_size
;
495 ///////////////////////// xattrs /////////////////////////////
497 int libradosstriper::RadosStriperImpl::getxattr(const object_t
& soid
,
501 std::string firstObjOid
= getObjectId(soid
, 0);
502 return m_ioCtx
.getxattr(firstObjOid
, name
, bl
);
505 int libradosstriper::RadosStriperImpl::setxattr(const object_t
& soid
,
509 std::string firstObjOid
= getObjectId(soid
, 0);
510 return m_ioCtx
.setxattr(firstObjOid
, name
, bl
);
513 int libradosstriper::RadosStriperImpl::getxattrs(const object_t
& soid
,
514 map
<string
, bufferlist
>& attrset
)
516 std::string firstObjOid
= getObjectId(soid
, 0);
517 int rc
= m_ioCtx
.getxattrs(firstObjOid
, attrset
);
519 // cleanup internal attributes dedicated to striping and locking
520 attrset
.erase(XATTR_LAYOUT_STRIPE_UNIT
);
521 attrset
.erase(XATTR_LAYOUT_STRIPE_COUNT
);
522 attrset
.erase(XATTR_LAYOUT_OBJECT_SIZE
);
523 attrset
.erase(XATTR_SIZE
);
524 attrset
.erase(std::string(LOCK_PREFIX
) + RADOS_LOCK_NAME
);
528 int libradosstriper::RadosStriperImpl::rmxattr(const object_t
& soid
,
531 std::string firstObjOid
= getObjectId(soid
, 0);
532 return m_ioCtx
.rmxattr(firstObjOid
, name
);
535 ///////////////////////// io /////////////////////////////
537 int libradosstriper::RadosStriperImpl::write(const std::string
& soid
,
538 const bufferlist
& bl
,
542 // open the object. This will create it if needed, retrieve its layout
543 // and size and take a shared lock on it
544 ceph_file_layout layout
;
545 std::string lockCookie
;
546 int rc
= createAndOpenStripedObject(soid
, &layout
, len
+off
, &lockCookie
, true);
548 return write_in_open_object(soid
, layout
, lockCookie
, bl
, len
, off
);
551 int libradosstriper::RadosStriperImpl::append(const std::string
& soid
,
552 const bufferlist
& bl
,
555 // open the object. This will create it if needed, retrieve its layout
556 // and size and take a shared lock on it
557 ceph_file_layout layout
;
559 std::string lockCookie
;
560 int rc
= openStripedObjectForWrite(soid
, &layout
, &size
, &lockCookie
, false);
562 return write_in_open_object(soid
, layout
, lockCookie
, bl
, len
, size
);
565 int libradosstriper::RadosStriperImpl::write_full(const std::string
& soid
,
566 const bufferlist
& bl
)
568 int rc
= trunc(soid
, 0);
569 if (rc
&& rc
!= -ENOENT
) return rc
; // ENOENT is obviously ok
570 return write(soid
, bl
, bl
.length(), 0);
573 int libradosstriper::RadosStriperImpl::read(const std::string
& soid
,
578 // create a completion object
579 librados::AioCompletionImpl c
;
580 // call asynchronous method
581 int rc
= aio_read(soid
, &c
, bl
, len
, off
);
582 // and wait for completion
584 // wait for completion
585 c
.wait_for_complete_and_cb();
587 rc
= c
.get_return_value();
592 ///////////////////////// asynchronous io /////////////////////////////
594 int libradosstriper::RadosStriperImpl::aio_write(const std::string
& soid
,
595 librados::AioCompletionImpl
*c
,
596 const bufferlist
& bl
,
600 ceph_file_layout layout
;
601 std::string lockCookie
;
602 int rc
= createAndOpenStripedObject(soid
, &layout
, len
+off
, &lockCookie
, true);
604 return aio_write_in_open_object(soid
, c
, layout
, lockCookie
, bl
, len
, off
);
607 int libradosstriper::RadosStriperImpl::aio_append(const std::string
& soid
,
608 librados::AioCompletionImpl
*c
,
609 const bufferlist
& bl
,
612 ceph_file_layout layout
;
614 std::string lockCookie
;
615 int rc
= openStripedObjectForWrite(soid
, &layout
, &size
, &lockCookie
, false);
617 // create a completion object
618 return aio_write_in_open_object(soid
, c
, layout
, lockCookie
, bl
, len
, size
);
621 int libradosstriper::RadosStriperImpl::aio_write_full(const std::string
& soid
,
622 librados::AioCompletionImpl
*c
,
623 const bufferlist
& bl
)
625 int rc
= trunc(soid
, 0);
627 return aio_write(soid
, c
, bl
, bl
.length(), 0);
630 static void rados_read_aio_unlock_complete(rados_striper_multi_completion_t c
, void *arg
)
632 auto cdata
= ceph::ref_t
<ReadCompletionData
>(static_cast<ReadCompletionData
*>(arg
), false);
633 libradosstriper::MultiAioCompletionImpl
*comp
=
634 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
635 cdata
->complete_unlock(comp
->rval
);
638 static void striper_read_aio_req_complete(rados_striper_multi_completion_t c
, void *arg
)
640 auto cdata
= static_cast<ReadCompletionData
*>(arg
);
641 // launch the async unlocking of the object
642 cdata
->m_striper
->aio_unlockObject(cdata
->m_soid
, cdata
->m_lockCookie
, cdata
->m_unlockCompletion
);
643 // complete the read part in parallel
644 libradosstriper::MultiAioCompletionImpl
*comp
=
645 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
646 cdata
->complete_read(comp
->rval
);
649 static void rados_req_read_complete(rados_completion_t c
, void *arg
)
651 auto data
= static_cast<RadosReadCompletionData
*>(arg
);
652 int rc
= rados_aio_get_return_value(c
);
653 // We need to handle the case of sparse files here
655 // the object did not exist at all. This can happen for sparse files.
656 // we consider we've read 0 bytes and it will fall into next case
660 if (rc
>= 0 && (((uint64_t)rc
) < data
->m_expectedBytes
)) {
661 // only partial data were present in the object (or the object did not
662 // even exist if we've gone through previous case).
663 // This is typical of sparse file and we need to complete with 0s.
664 unsigned int lenOfZeros
= data
->m_expectedBytes
-rc
;
665 unsigned int existingDataToZero
= std::min(data
->m_bl
->length()-rc
, lenOfZeros
);
666 if (existingDataToZero
> 0) {
667 data
->m_bl
->zero(rc
, existingDataToZero
);
669 if (lenOfZeros
> existingDataToZero
) {
670 ceph::bufferptr
zeros(ceph::buffer::create(lenOfZeros
-existingDataToZero
));
672 data
->m_bl
->push_back(zeros
);
674 nread
= data
->m_expectedBytes
;
676 auto multi_aio_comp
= data
->m_multiAioCompl
;
677 multi_aio_comp
->complete_request(nread
);
678 multi_aio_comp
->safe_request(rc
);
681 int libradosstriper::RadosStriperImpl::aio_read(const std::string
& soid
,
682 librados::AioCompletionImpl
*c
,
687 // open the object. This will retrieve its layout and size
688 // and take a shared lock on it
689 ceph_file_layout layout
;
691 std::string lockCookie
;
692 int rc
= openStripedObjectForRead(soid
, &layout
, &size
, &lockCookie
);
694 // find out the actual number of bytes we can read
697 // nothing to read ! We are done.
700 read_len
= std::min(len
, (size_t)(size
-off
));
702 // get list of extents to be read from
703 vector
<ObjectExtent
> *extents
= new vector
<ObjectExtent
>();
705 std::string format
= soid
;
706 boost::replace_all(format
, "%", "%%");
707 format
+= RADOS_OBJECT_EXTENSION_FORMAT
;
709 l
.from_legacy(layout
);
710 Striper::file_to_extents(cct(), format
.c_str(), &l
, off
, read_len
,
714 // create a completion object and transfer ownership of extents and resultbl
715 vector
<bufferlist
> *resultbl
= new vector
<bufferlist
>(extents
->size());
716 auto cdata
= ceph::make_ref
<ReadCompletionData
>(this, soid
, lockCookie
, c
, bl
, extents
, resultbl
);
719 // create a completion for the unlocking of the striped object at the end of the read
720 librados::AioCompletion
*unlock_completion
=
721 librados::Rados::aio_create_completion(cdata
->get() /* create ref! */, rados_read_aio_unlock_complete
);
722 cdata
->m_unlockCompletion
= unlock_completion
;
723 // create the multiCompletion object handling the reads
724 MultiAioCompletionImplPtr nc
{new libradosstriper::MultiAioCompletionImpl
,
726 nc
->set_complete_callback(cdata
.get(), striper_read_aio_req_complete
);
727 // go through the extents
729 for (vector
<ObjectExtent
>::iterator p
= extents
->begin(); p
!= extents
->end(); ++p
) {
730 // create a buffer list describing where to place data read from current extend
731 bufferlist
*oid_bl
= &((*resultbl
)[i
++]);
732 for (vector
<pair
<uint64_t,uint64_t> >::iterator q
= p
->buffer_extents
.begin();
733 q
!= p
->buffer_extents
.end();
735 bufferlist buffer_bl
;
736 buffer_bl
.substr_of(*bl
, q
->first
, q
->second
);
737 oid_bl
->append(buffer_bl
);
739 // read all extends of a given object in one go
741 // we need 2 references on data as both rados_req_read_safe and rados_req_read_complete
743 auto data
= ceph::make_ref
<RadosReadCompletionData
>(nc
, p
->length
, oid_bl
, cct());
744 librados::AioCompletion
*rados_completion
=
745 librados::Rados::aio_create_completion(data
.detach(), rados_req_read_complete
);
746 r
= m_ioCtx
.aio_read(p
->oid
.name
, rados_completion
, oid_bl
, p
->length
, p
->offset
);
747 rados_completion
->release();
751 nc
->finish_adding_requests();
755 int libradosstriper::RadosStriperImpl::aio_read(const std::string
& soid
,
756 librados::AioCompletionImpl
*c
,
761 // create a buffer list and store it inside the completion object
763 c
->bl
.push_back(buffer::create_static(len
, buf
));
764 // call the bufferlist version of this method
765 return aio_read(soid
, c
, &c
->bl
, len
, off
);
768 int libradosstriper::RadosStriperImpl::aio_flush()
771 // pass to the rados level
772 ret
= m_ioCtx
.aio_flush();
775 //wait all CompletionData are released
776 std::unique_lock l
{lock
};
777 cond
.wait(l
, [this] {return m_refCnt
<= 1;});
781 ///////////////////////// stat and deletion /////////////////////////////
783 int libradosstriper::RadosStriperImpl::stat(const std::string
& soid
, uint64_t *psize
, time_t *pmtime
)
785 // create a completion object
786 librados::AioCompletionImpl c
;
787 // call asynchronous version of stat
788 int rc
= aio_stat(soid
, &c
, psize
, pmtime
);
790 // wait for completion of the remove
791 c
.wait_for_complete();
793 rc
= c
.get_return_value();
798 static void striper_stat_aio_stat_complete(rados_completion_t c
, void *arg
) {
799 auto data
= ceph::ref_t
<BasicStatCompletionData
>(static_cast<BasicStatCompletionData
*>(arg
), false);
800 int rc
= rados_aio_get_return_value(c
);
802 // remember this has failed
805 data
->m_multiCompletion
->complete_request(rc
);
808 static void striper_stat_aio_getxattr_complete(rados_completion_t c
, void *arg
) {
809 auto data
= ceph::ref_t
<BasicStatCompletionData
>(static_cast<BasicStatCompletionData
*>(arg
), false);
810 int rc
= rados_aio_get_return_value(c
);
811 // We need to handle the case of sparse files here
813 // remember this has failed
814 data
->m_getxattrRC
= rc
;
816 // this intermediate string allows to add a null terminator before calling strtol
818 std::string
strsize(data
->m_bl
.c_str(), data
->m_bl
.length());
819 *data
->m_psize
= strict_strtoll(strsize
.c_str(), 10, &err
);
821 lderr(data
->m_striper
->cct()) << XATTR_SIZE
<< " : " << err
<< dendl
;
822 data
->m_getxattrRC
= -EINVAL
;
826 data
->m_multiCompletion
->complete_request(rc
);
829 static void striper_stat_aio_req_complete(rados_striper_multi_completion_t c
,
831 auto data
= ceph::ref_t
<BasicStatCompletionData
>(static_cast<BasicStatCompletionData
*>(arg
), false);
832 if (data
->m_statRC
) {
833 data
->complete(data
->m_statRC
);
835 if (data
->m_getxattrRC
< 0) {
836 data
->complete(data
->m_getxattrRC
);
843 template<class TimeType
>
844 int libradosstriper::RadosStriperImpl::aio_generic_stat
845 (const std::string
& soid
,
846 librados::AioCompletionImpl
*c
,
849 typename
libradosstriper::RadosStriperImpl::StatFunction
<TimeType
>::Type statFunction
)
851 // use a MultiAioCompletion object for dealing with the fact
852 // that we'll do 2 asynchronous calls in parallel
853 MultiAioCompletionImplPtr multi_completion
{
854 new libradosstriper::MultiAioCompletionImpl
, false};
855 // Data object used for passing context to asynchronous calls
856 std::string firstObjOid
= getObjectId(soid
, 0);
857 auto cdata
= ceph::make_ref
<StatCompletionData
<TimeType
>>(this, firstObjOid
, c
, multi_completion
.get(), psize
, pmtime
);
858 multi_completion
->set_complete_callback(cdata
->get() /* create ref! */, striper_stat_aio_req_complete
);
859 // use a regular AioCompletion for the stat async call
860 librados::AioCompletion
*stat_completion
=
861 librados::Rados::aio_create_completion(cdata
->get() /* create ref! */, striper_stat_aio_stat_complete
);
862 multi_completion
->add_safe_request();
863 object_t
obj(firstObjOid
);
864 int rc
= (m_ioCtxImpl
->*statFunction
)(obj
, stat_completion
->pc
,
865 &cdata
->m_objectSize
, cdata
->m_pmtime
);
866 stat_completion
->release();
868 // nothing is really started so cancel everything
869 delete cdata
.detach();
872 // use a regular AioCompletion for the getxattr async call
873 librados::AioCompletion
*getxattr_completion
=
874 librados::Rados::aio_create_completion(cdata
->get() /* create ref! */, striper_stat_aio_getxattr_complete
);
875 multi_completion
->add_safe_request();
876 // in parallel, get the pmsize from the first object asynchronously
877 rc
= m_ioCtxImpl
->aio_getxattr(obj
, getxattr_completion
->pc
,
878 XATTR_SIZE
, cdata
->m_bl
);
879 getxattr_completion
->release();
880 multi_completion
->finish_adding_requests();
882 // the async stat is ongoing, so we need to go on
883 // we mark the getxattr as failed in the data object
884 cdata
->m_getxattrRC
= rc
;
885 multi_completion
->complete_request(rc
);
891 int libradosstriper::RadosStriperImpl::aio_stat(const std::string
& soid
,
892 librados::AioCompletionImpl
*c
,
896 return aio_generic_stat
<time_t>(soid
, c
, psize
, pmtime
, &librados::IoCtxImpl::aio_stat
);
899 int libradosstriper::RadosStriperImpl::stat2(const std::string
& soid
, uint64_t *psize
, struct timespec
*pts
)
901 // create a completion object
902 librados::AioCompletionImpl c
;
903 // call asynchronous version of stat
904 int rc
= aio_stat2(soid
, &c
, psize
, pts
);
906 // wait for completion of the remove
907 c
.wait_for_complete_and_cb();
909 rc
= c
.get_return_value();
914 int libradosstriper::RadosStriperImpl::aio_stat2(const std::string
& soid
,
915 librados::AioCompletionImpl
*c
,
917 struct timespec
*pts
)
919 return aio_generic_stat
<struct timespec
>(soid
, c
, psize
, pts
, &librados::IoCtxImpl::aio_stat2
);
922 static void rados_req_remove_complete(rados_completion_t c
, void *arg
)
924 auto cdata
= static_cast<RadosRemoveCompletionData
*>(arg
);
925 int rc
= rados_aio_get_return_value(c
);
926 // in case the object did not exist, it means we had a sparse file, all is fine
930 cdata
->m_multiAioCompl
->complete_request(rc
);
931 cdata
->m_multiAioCompl
->safe_request(rc
);
934 static void striper_remove_aio_req_complete(rados_striper_multi_completion_t c
, void *arg
)
936 auto cdata
= ceph::ref_t
<RemoveCompletionData
>(static_cast<RemoveCompletionData
*>(arg
), false);
937 libradosstriper::MultiAioCompletionImpl
*comp
=
938 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
939 ldout(cdata
->m_striper
->cct(), 10)
940 << "RadosStriperImpl : striper_remove_aio_req_complete called for "
941 << cdata
->m_soid
<< dendl
;
944 // All went fine, synchronously remove first object
945 rc
= cdata
->m_striper
->m_ioCtx
.remove(cdata
->m_striper
->getObjectId(cdata
->m_soid
, 0),
948 lderr(cdata
->m_striper
->cct())
949 << "RadosStriperImpl : deletion/truncation incomplete for " << cdata
->m_soid
950 << ", as errors were encountered. The file is left present but it's content "
951 << " has been partially removed"
957 int libradosstriper::RadosStriperImpl::remove(const std::string
& soid
, int flags
)
959 // create a completion object
960 librados::AioCompletionImpl c
;
961 // call asynchronous version of remove
962 int rc
= aio_remove(soid
, &c
, flags
);
964 // wait for completion of the remove
965 c
.wait_for_complete_and_cb();
967 rc
= c
.get_return_value();
972 int libradosstriper::RadosStriperImpl::aio_remove(const std::string
& soid
,
973 librados::AioCompletionImpl
*c
,
976 // the RemoveCompletionData object will lock the given soid for the duration
978 std::string lockCookie
= getUUID();
979 int rc
= m_ioCtx
.lock_exclusive(getObjectId(soid
, 0), RADOS_LOCK_NAME
, lockCookie
, "", 0, 0);
981 // create CompletionData for the async remove call
982 auto cdata
= ceph::make_ref
<RemoveCompletionData
>(this, soid
, lockCookie
, c
, flags
);
983 MultiAioCompletionImplPtr multi_completion
{
984 new libradosstriper::MultiAioCompletionImpl
, false};
985 multi_completion
->set_complete_callback(cdata
->get() /* create ref! */, striper_remove_aio_req_complete
);
986 // call asynchronous internal version of remove
988 << "RadosStriperImpl : Aio_remove starting for "
990 rc
= internal_aio_remove(soid
, multi_completion
);
994 int libradosstriper::RadosStriperImpl::internal_aio_remove(
995 const std::string
& soid
,
996 MultiAioCompletionImplPtr multi_completion
,
999 std::string firstObjOid
= getObjectId(soid
, 0);
1001 // check size and get number of rados objects to delete
1002 uint64_t nb_objects
= 0;
1004 int rc
= getxattr(soid
, XATTR_SIZE
, bl2
);
1006 // no object size (or not able to get it)
1007 // try to find the number of object "by hand"
1010 while (!m_ioCtx
.stat(getObjectId(soid
, nb_objects
), &psize
, &pmtime
)) {
1014 // count total number of rados objects in the striped object
1016 // this intermediate string allows to add a null terminator before calling strtol
1017 std::string
strsize(bl2
.c_str(), bl2
.length());
1018 uint64_t size
= strict_strtoll(strsize
.c_str(), 10, &err
);
1020 lderr(cct()) << XATTR_SIZE
<< " : " << err
<< dendl
;
1024 uint64_t object_size
= m_layout
.fl_object_size
;
1025 uint64_t su
= m_layout
.fl_stripe_unit
;
1026 uint64_t stripe_count
= m_layout
.fl_stripe_count
;
1027 uint64_t nb_complete_sets
= size
/ (object_size
*stripe_count
);
1028 uint64_t remaining_data
= size
% (object_size
*stripe_count
);
1029 uint64_t remaining_stripe_units
= (remaining_data
+ su
-1) / su
;
1030 uint64_t remaining_objects
= std::min(remaining_stripe_units
, stripe_count
);
1031 nb_objects
= nb_complete_sets
* stripe_count
+ remaining_objects
;
1033 // delete rados objects in reverse order
1034 // Note that we do not drop the first object. This one will only be dropped
1035 // if all other removals have been successful, and this is done in the
1036 // callback of the multi_completion object
1038 for (int i
= nb_objects
-1; i
>= 1; i
--) {
1039 multi_completion
->add_request();
1040 auto data
= ceph::make_ref
<RadosRemoveCompletionData
>(multi_completion
, cct());
1041 librados::AioCompletion
*rados_completion
=
1042 librados::Rados::aio_create_completion(data
->get() /* create ref! */,
1043 rados_req_remove_complete
);
1045 rcr
= m_ioCtx
.aio_remove(getObjectId(soid
, i
), rados_completion
);
1047 rcr
= m_ioCtx
.aio_remove(getObjectId(soid
, i
), rados_completion
, flags
);
1049 rados_completion
->release();
1050 if (rcr
< 0 and -ENOENT
!= rcr
) {
1051 lderr(cct()) << "RadosStriperImpl::remove : deletion incomplete for " << soid
1052 << ", as " << getObjectId(soid
, i
) << " could not be deleted (rc=" << rc
<< ")"
1057 // we are over adding requests to the multi_completion object
1058 multi_completion
->finish_adding_requests();
1061 } catch (ErrorCode
&e
) {
1062 // error caught when trying to take the exclusive lock
1068 int libradosstriper::RadosStriperImpl::trunc(const std::string
& soid
, uint64_t size
)
1070 // lock the object in exclusive mode
1071 std::string firstObjOid
= getObjectId(soid
, 0);
1072 librados::ObjectWriteOperation op
;
1074 std::string lockCookie
= RadosStriperImpl::getUUID();
1075 utime_t dur
= utime_t();
1076 rados::cls::lock::lock(&op
, RADOS_LOCK_NAME
, ClsLockType::EXCLUSIVE
, lockCookie
, "", "", dur
, 0);
1077 int rc
= m_ioCtx
.operate(firstObjOid
, &op
);
1079 // load layout and size
1080 ceph_file_layout layout
;
1081 uint64_t original_size
;
1082 rc
= internal_get_layout_and_size(firstObjOid
, &layout
, &original_size
);
1084 if (size
< original_size
) {
1085 rc
= truncate(soid
, original_size
, size
, layout
);
1086 } else if (size
> original_size
) {
1087 rc
= grow(soid
, original_size
, size
, layout
);
1090 // unlock object, ignore return code as we cannot do much
1091 m_ioCtx
.unlock(firstObjOid
, RADOS_LOCK_NAME
, lockCookie
);
1097 ///////////////////////// private helpers /////////////////////////////
1099 std::string
libradosstriper::RadosStriperImpl::getObjectId(const object_t
& soid
,
1100 long long unsigned objectno
)
1102 std::ostringstream s
;
1103 s
<< soid
<< '.' << std::setfill ('0') << std::setw(16) << std::hex
<< objectno
;
1107 void libradosstriper::RadosStriperImpl::unlockObject(const std::string
& soid
,
1108 const std::string
& lockCookie
)
1110 // unlock the shared lock on the first rados object
1111 std::string firstObjOid
= getObjectId(soid
, 0);
1112 m_ioCtx
.unlock(firstObjOid
, RADOS_LOCK_NAME
, lockCookie
);
1115 void libradosstriper::RadosStriperImpl::aio_unlockObject(const std::string
& soid
,
1116 const std::string
& lockCookie
,
1117 librados::AioCompletion
*c
)
1119 // unlock the shared lock on the first rados object
1120 std::string firstObjOid
= getObjectId(soid
, 0);
1121 m_ioCtx
.aio_unlock(firstObjOid
, RADOS_LOCK_NAME
, lockCookie
, c
);
1124 static void rados_write_aio_unlock_complete(rados_striper_multi_completion_t c
, void *arg
)
1126 auto cdata
= ceph::ref_t
<WriteCompletionData
>(static_cast<WriteCompletionData
*>(arg
), false);
1127 libradosstriper::MultiAioCompletionImpl
*comp
=
1128 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
1129 cdata
->complete_unlock(comp
->rval
);
1132 static void striper_write_aio_req_complete(rados_striper_multi_completion_t c
, void *arg
)
1134 auto cdata
= ceph::ref_t
<WriteCompletionData
>(static_cast<WriteCompletionData
*>(arg
), false);
1135 // launch the async unlocking of the object
1136 cdata
->m_striper
->aio_unlockObject(cdata
->m_soid
, cdata
->m_lockCookie
, cdata
->m_unlockCompletion
);
1137 // complete the write part in parallel
1138 libradosstriper::MultiAioCompletionImpl
*comp
=
1139 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
1140 cdata
->complete_write(comp
->rval
);
1143 static void striper_write_aio_req_safe(rados_striper_multi_completion_t c
, void *arg
)
1145 auto cdata
= ceph::ref_t
<WriteCompletionData
>(static_cast<WriteCompletionData
*>(arg
), false);
1146 libradosstriper::MultiAioCompletionImpl
*comp
=
1147 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
1148 cdata
->safe(comp
->rval
);
1151 int libradosstriper::RadosStriperImpl::write_in_open_object(const std::string
& soid
,
1152 const ceph_file_layout
& layout
,
1153 const std::string
& lockCookie
,
1154 const bufferlist
& bl
,
1157 // create a completion object to be passed to the callbacks of the multicompletion
1158 // we need 3 references as striper_write_aio_req_complete will release two and
1159 // striper_write_aio_req_safe will release one
1160 auto cdata
= ceph::make_ref
<WriteCompletionData
>(this, soid
, lockCookie
, nullptr);
1161 // create a completion object for the unlocking of the striped object at the end of the write
1162 librados::AioCompletion
*unlock_completion
=
1163 librados::Rados::aio_create_completion(cdata
->get() /* create ref! */, rados_write_aio_unlock_complete
);
1164 cdata
->m_unlockCompletion
= unlock_completion
;
1165 // create the multicompletion that will handle the write completion
1166 MultiAioCompletionImplPtr c
{new libradosstriper::MultiAioCompletionImpl
,
1168 c
->set_complete_callback(cdata
->get() /* create ref! */, striper_write_aio_req_complete
);
1169 c
->set_safe_callback(cdata
->get() /* create ref! */, striper_write_aio_req_safe
);
1170 // call the asynchronous API
1171 int rc
= internal_aio_write(soid
, c
, bl
, len
, off
, layout
);
1173 // wait for completion and safety of data
1174 c
->wait_for_complete_and_cb();
1175 c
->wait_for_safe_and_cb();
1176 // wait for the unlocking
1177 unlock_completion
->wait_for_complete();
1179 rc
= c
->get_return_value();
1184 int libradosstriper::RadosStriperImpl::aio_write_in_open_object(const std::string
& soid
,
1185 librados::AioCompletionImpl
*c
,
1186 const ceph_file_layout
& layout
,
1187 const std::string
& lockCookie
,
1188 const bufferlist
& bl
,
1191 // create a completion object to be passed to the callbacks of the multicompletion
1192 // we need 3 references as striper_write_aio_req_complete will release two and
1193 // striper_write_aio_req_safe will release one
1194 auto cdata
= ceph::make_ref
<WriteCompletionData
>(this, soid
, lockCookie
, c
);
1196 c
->io
= m_ioCtxImpl
;
1197 // create a completion object for the unlocking of the striped object at the end of the write
1198 librados::AioCompletion
*unlock_completion
=
1199 librados::Rados::aio_create_completion(cdata
->get() /* create ref! */, rados_write_aio_unlock_complete
);
1200 cdata
->m_unlockCompletion
= unlock_completion
;
1201 // create the multicompletion that will handle the write completion
1202 libradosstriper::MultiAioCompletionImplPtr nc
{
1203 new libradosstriper::MultiAioCompletionImpl
, false};
1204 nc
->set_complete_callback(cdata
->get() /* create ref! */, striper_write_aio_req_complete
);
1205 nc
->set_safe_callback(cdata
->get() /* create ref! */, striper_write_aio_req_safe
);
1206 // internal asynchronous API
1207 int rc
= internal_aio_write(soid
, nc
, bl
, len
, off
, layout
);
1211 static void rados_req_write_complete(rados_completion_t c
, void *arg
)
1213 auto comp
= reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(arg
);
1214 comp
->complete_request(rados_aio_get_return_value(c
));
1215 comp
->safe_request(rados_aio_get_return_value(c
));
1219 libradosstriper::RadosStriperImpl::internal_aio_write(const std::string
& soid
,
1220 libradosstriper::MultiAioCompletionImplPtr c
,
1221 const bufferlist
& bl
,
1224 const ceph_file_layout
& layout
)
1227 // Do not try anything if we are called with empty buffer,
1228 // file_to_extents would raise an exception
1230 // get list of extents to be written to
1231 vector
<ObjectExtent
> extents
;
1232 std::string format
= soid
;
1233 boost::replace_all(format
, "%", "%%");
1234 format
+= RADOS_OBJECT_EXTENSION_FORMAT
;
1236 l
.from_legacy(layout
);
1237 Striper::file_to_extents(cct(), format
.c_str(), &l
, off
, len
, 0, extents
);
1238 // go through the extents
1239 for (vector
<ObjectExtent
>::iterator p
= extents
.begin(); p
!= extents
.end(); ++p
) {
1240 // assemble pieces of a given object into a single buffer list
1242 for (vector
<pair
<uint64_t,uint64_t> >::iterator q
= p
->buffer_extents
.begin();
1243 q
!= p
->buffer_extents
.end();
1245 bufferlist buffer_bl
;
1246 buffer_bl
.substr_of(bl
, q
->first
, q
->second
);
1247 oid_bl
.append(buffer_bl
);
1249 // and write the object
1251 librados::AioCompletion
*rados_completion
=
1252 librados::Rados::aio_create_completion(c
.get(),
1253 rados_req_write_complete
);
1254 r
= m_ioCtx
.aio_write(p
->oid
.name
, rados_completion
, oid_bl
,
1255 p
->length
, p
->offset
);
1256 rados_completion
->release();
1261 c
->finish_adding_requests();
// Looks up `key` in the attribute map `attrs` and parses the attribute value
// as a base 10 integer, storing the result in `*value`.
// NOTE(review): the declaration of the third parameter (`value`), the
// declaration of `err` and the return statements are not visible in this
// listing -- confirm the exact return codes against the complete source.
1265 int libradosstriper::RadosStriperImpl::extract_uint32_attr
1266 (std::map
<std::string
, bufferlist
> &attrs
,
1267 const std::string
& key
,
1270 std::map
<std::string
, bufferlist
>::iterator attrsIt
= attrs
.find(key
);
// the parsing below is only done when the attribute is present in the map
1271 if (attrsIt
!= attrs
.end()) {
1272 // this intermediate string allows to add a null terminator before calling strtol
1273 std::string
strvalue(attrsIt
->second
.c_str(), attrsIt
->second
.length());
// strict_strtol receives `&err`, which is what gets logged just below
1275 *value
= strict_strtol(strvalue
.c_str(), 10, &err
);
// log the attribute name together with the parser's message
1277 lderr(cct()) << key
<< " : " << err
<< dendl
;
1286 int libradosstriper::RadosStriperImpl::extract_sizet_attr
1287 (std::map
<std::string
, bufferlist
> &attrs
,
1288 const std::string
& key
,
1291 std::map
<std::string
, bufferlist
>::iterator attrsIt
= attrs
.find(key
);
1292 if (attrsIt
!= attrs
.end()) {
1293 // this intermediate string allows to add a null terminator before calling strtol
1294 std::string
strvalue(attrsIt
->second
.c_str(), attrsIt
->second
.length());
1296 *value
= strict_strtoll(strvalue
.c_str(), 10, &err
);
1298 lderr(cct()) << key
<< " : " << err
<< dendl
;
1307 int libradosstriper::RadosStriperImpl::internal_get_layout_and_size(
1308 const std::string
& oid
,
1309 ceph_file_layout
*layout
,
1312 // get external attributes of the first rados object
1313 std::map
<std::string
, bufferlist
> attrs
;
1314 int rc
= m_ioCtx
.getxattrs(oid
, attrs
);
1316 // deal with stripe_unit
1317 rc
= extract_uint32_attr(attrs
, XATTR_LAYOUT_STRIPE_UNIT
, &layout
->fl_stripe_unit
);
1319 // deal with stripe_count
1320 rc
= extract_uint32_attr(attrs
, XATTR_LAYOUT_STRIPE_COUNT
, &layout
->fl_stripe_count
);
1322 // deal with object_size
1323 rc
= extract_uint32_attr(attrs
, XATTR_LAYOUT_OBJECT_SIZE
, &layout
->fl_object_size
);
1327 rc
= extract_sizet_attr(attrs
, XATTR_SIZE
, &ssize
);
1332 // make valgrind happy by setting unused fl_pg_pool
1333 layout
->fl_pg_pool
= 0;
1337 int libradosstriper::RadosStriperImpl::openStripedObjectForRead(
1338 const std::string
& soid
,
1339 ceph_file_layout
*layout
,
1341 std::string
*lockCookie
)
1343 // take a lock the first rados object, if it exists and gets its size
1344 // check, lock and size reading must be atomic and are thus done within a single operation
1345 librados::ObjectWriteOperation op
;
1347 *lockCookie
= getUUID();
1348 utime_t dur
= utime_t();
1349 rados::cls::lock::lock(&op
, RADOS_LOCK_NAME
, ClsLockType::SHARED
, *lockCookie
, "Tag", "", dur
, 0);
1350 std::string firstObjOid
= getObjectId(soid
, 0);
1351 int rc
= m_ioCtx
.operate(firstObjOid
, &op
);
1353 // error case (including -ENOENT)
1356 rc
= internal_get_layout_and_size(firstObjOid
, layout
, size
);
1358 unlockObject(soid
, *lockCookie
);
1359 lderr(cct()) << "RadosStriperImpl::openStripedObjectForRead : "
1360 << "could not load layout and size for "
1361 << soid
<< " : rc = " << rc
<< dendl
;
1366 int libradosstriper::RadosStriperImpl::openStripedObjectForWrite(const std::string
& soid
,
1367 ceph_file_layout
*layout
,
1369 std::string
*lockCookie
,
1370 bool isFileSizeAbsolute
)
1372 // take a lock the first rados object, if it exists
1373 // check and lock must be atomic and are thus done within a single operation
1374 librados::ObjectWriteOperation op
;
1376 *lockCookie
= getUUID();
1377 utime_t dur
= utime_t();
1378 rados::cls::lock::lock(&op
, RADOS_LOCK_NAME
, ClsLockType::SHARED
, *lockCookie
, "Tag", "", dur
, 0);
1379 std::string firstObjOid
= getObjectId(soid
, 0);
1380 int rc
= m_ioCtx
.operate(firstObjOid
, &op
);
1382 if (rc
== -ENOENT
) {
1383 // object does not exist, delegate to createEmptyStripedObject
1384 int rc
= createAndOpenStripedObject(soid
, layout
, *size
, lockCookie
, isFileSizeAbsolute
);
1385 // return original size
1394 rc
= internal_get_layout_and_size(firstObjOid
, layout
, &curSize
);
1396 unlockObject(soid
, *lockCookie
);
1397 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1398 << "could not load layout and size for "
1399 << soid
<< " : rc = " << rc
<< dendl
;
1402 // atomically update object size, only if smaller than current one
1403 if (!isFileSizeAbsolute
)
1405 librados::ObjectWriteOperation writeOp
;
1406 writeOp
.cmpxattr(XATTR_SIZE
, LIBRADOS_CMPXATTR_OP_GT
, *size
);
1407 std::ostringstream oss
;
1410 bl
.append(oss
.str());
1411 writeOp
.setxattr(XATTR_SIZE
, bl
);
1412 rc
= m_ioCtx
.operate(firstObjOid
, &writeOp
);
1413 // return current size
1415 // handle case where objectsize is already bigger than size
1416 if (-ECANCELED
== rc
)
1419 unlockObject(soid
, *lockCookie
);
1420 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1421 << "could not set new size for "
1422 << soid
<< " : rc = " << rc
<< dendl
;
1427 int libradosstriper::RadosStriperImpl::createAndOpenStripedObject(const std::string
& soid
,
1428 ceph_file_layout
*layout
,
1430 std::string
*lockCookie
,
1431 bool isFileSizeAbsolute
)
1433 // build atomic write operation
1434 librados::ObjectWriteOperation writeOp
;
1435 writeOp
.create(true);
1437 std::ostringstream oss_object_size
;
1438 oss_object_size
<< m_layout
.fl_object_size
;
1439 bufferlist bl_object_size
;
1440 bl_object_size
.append(oss_object_size
.str());
1441 writeOp
.setxattr(XATTR_LAYOUT_OBJECT_SIZE
, bl_object_size
);
1443 std::ostringstream oss_stripe_unit
;
1444 oss_stripe_unit
<< m_layout
.fl_stripe_unit
;
1445 bufferlist bl_stripe_unit
;
1446 bl_stripe_unit
.append(oss_stripe_unit
.str());
1447 writeOp
.setxattr(XATTR_LAYOUT_STRIPE_UNIT
, bl_stripe_unit
);
1449 std::ostringstream oss_stripe_count
;
1450 oss_stripe_count
<< m_layout
.fl_stripe_count
;
1451 bufferlist bl_stripe_count
;
1452 bl_stripe_count
.append(oss_stripe_count
.str());
1453 writeOp
.setxattr(XATTR_LAYOUT_STRIPE_COUNT
, bl_stripe_count
);
1455 std::ostringstream oss_size
;
1456 oss_size
<< (isFileSizeAbsolute
?size
:0);
1458 bl_size
.append(oss_size
.str());
1459 writeOp
.setxattr(XATTR_SIZE
, bl_size
);
1460 // effectively change attributes
1461 std::string firstObjOid
= getObjectId(soid
, 0);
1462 int rc
= m_ioCtx
.operate(firstObjOid
, &writeOp
);
1463 // in case of error (but no EEXIST which would mean the object existed), return
1464 if (rc
&& -EEXIST
!= rc
) return rc
;
1465 // Otherwise open the object
1466 uint64_t fileSize
= size
;
1467 return openStripedObjectForWrite(soid
, layout
, &fileSize
, lockCookie
, isFileSizeAbsolute
);
1470 static void striper_truncate_aio_req_complete(rados_striper_multi_completion_t c
, void *arg
)
1472 auto cdata
= ceph::ref_t
<TruncateCompletionData
>(static_cast<TruncateCompletionData
*>(arg
), false);
1473 libradosstriper::MultiAioCompletionImpl
*comp
=
1474 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
1475 if (0 == comp
->rval
) {
1476 // all went fine, change size in the external attributes
1477 std::ostringstream oss
;
1478 oss
<< cdata
->m_size
;
1480 bl
.append(oss
.str());
1481 cdata
->m_striper
->setxattr(cdata
->m_soid
, XATTR_SIZE
, bl
);
1485 int libradosstriper::RadosStriperImpl::truncate(const std::string
& soid
,
1486 uint64_t original_size
,
1488 ceph_file_layout
&layout
)
1490 auto cdata
= ceph::make_ref
<TruncateCompletionData
>(this, soid
, size
);
1491 libradosstriper::MultiAioCompletionImplPtr multi_completion
{
1492 new libradosstriper::MultiAioCompletionImpl
, false};
1493 multi_completion
->set_complete_callback(cdata
->get() /* create ref! */, striper_truncate_aio_req_complete
);
1494 // call asynchrous version of truncate
1495 int rc
= aio_truncate(soid
, multi_completion
, original_size
, size
, layout
);
1496 // wait for completion of the truncation
1497 multi_completion
->finish_adding_requests();
1498 multi_completion
->wait_for_complete_and_cb();
1501 rc
= multi_completion
->get_return_value();
1506 int libradosstriper::RadosStriperImpl::aio_truncate
1507 (const std::string
& soid
,
1508 libradosstriper::MultiAioCompletionImplPtr multi_completion
,
1509 uint64_t original_size
,
1511 ceph_file_layout
&layout
)
1513 // handle the underlying rados objects. 3 cases here :
1514 // -- the objects belonging to object sets entirely located
1515 // before the truncation are unchanged
1516 // -- the objects belonging to the object set where the
1517 // truncation took place are truncated or removed
1518 // -- the objects belonging to object sets entirely located
1519 // after the truncation are removed
1520 // Note that we do it backward and that we change the size in
1521 // the external attributes only at the end. This make sure that
1522 // no rados object stays behind if we remove the striped object
1523 // after a truncation has failed
1524 uint64_t trunc_objectsetno
= size
/ layout
.fl_object_size
/ layout
.fl_stripe_count
;
1525 uint64_t last_objectsetno
= original_size
/ layout
.fl_object_size
/ layout
.fl_stripe_count
;
1526 bool exists
= false;
1527 for (int64_t objectno
= (last_objectsetno
+1) * layout
.fl_stripe_count
-1;
1528 objectno
>= (int64_t)((trunc_objectsetno
+ 1) * layout
.fl_stripe_count
);
1530 // if no object existed so far, check object existence
1532 uint64_t nb_full_object_set
= objectno
/ layout
.fl_stripe_count
;
1533 uint64_t object_index_in_set
= objectno
% layout
.fl_stripe_count
;
1534 uint64_t set_start_off
= nb_full_object_set
* layout
.fl_object_size
* layout
.fl_stripe_count
;
1535 uint64_t object_start_off
= set_start_off
+ object_index_in_set
* layout
.fl_stripe_unit
;
1536 exists
= (original_size
> object_start_off
);
1539 // remove asynchronously
1540 multi_completion
->add_request();
1541 auto data
= ceph::make_ref
<RadosRemoveCompletionData
>(multi_completion
, cct());
1542 librados::AioCompletion
*rados_completion
=
1543 librados::Rados::aio_create_completion(data
->get() /* create ref! */,
1544 rados_req_remove_complete
);
1545 int rc
= m_ioCtx
.aio_remove(getObjectId(soid
, objectno
), rados_completion
);
1546 rados_completion
->release();
1547 // in case the object did not exist, it means we had a sparse file, all is fine
1548 if (rc
&& rc
!= -ENOENT
) return rc
;
1551 for (int64_t objectno
= ((trunc_objectsetno
+ 1) * layout
.fl_stripe_count
) -1;
1552 objectno
>= (int64_t)(trunc_objectsetno
* layout
.fl_stripe_count
);
1554 // if no object existed so far, check object existence
1556 uint64_t object_start_off
= ((objectno
/ layout
.fl_stripe_count
) * layout
.fl_object_size
) +
1557 ((objectno
% layout
.fl_stripe_count
) * layout
.fl_stripe_unit
);
1558 exists
= (original_size
> object_start_off
);
1563 l
.from_legacy(layout
);
1564 uint64_t new_object_size
= Striper::object_truncate_size(cct(), &l
, objectno
, size
);
1566 if (new_object_size
> 0 or 0 == objectno
) {
1567 // trunc is synchronous as there is no async version
1568 // but note that only a single object will be truncated
1569 // reducing the overload to a fixed amount
1570 rc
= m_ioCtx
.trunc(getObjectId(soid
, objectno
), new_object_size
);
1572 // removes are asynchronous in order to speed up truncations of big files
1573 multi_completion
->add_request();
1574 auto data
= ceph::make_ref
<RadosRemoveCompletionData
>(multi_completion
, cct());
1575 librados::AioCompletion
*rados_completion
=
1576 librados::Rados::aio_create_completion(data
->get() /* create ref! */,
1577 rados_req_remove_complete
);
1578 rc
= m_ioCtx
.aio_remove(getObjectId(soid
, objectno
), rados_completion
);
1579 rados_completion
->release();
1581 // in case the object did not exist, it means we had a sparse file, all is fine
1582 if (rc
&& rc
!= -ENOENT
) return rc
;
1588 int libradosstriper::RadosStriperImpl::grow(const std::string
& soid
,
1589 uint64_t original_size
,
1591 ceph_file_layout
&layout
)
1593 // handle the underlying rados objects. As we support sparse objects,
1594 // we only have to change the size in the external attributes
1595 std::ostringstream oss
;
1598 bl
.append(oss
.str());
1599 int rc
= m_ioCtx
.setxattr(getObjectId(soid
, 0), XATTR_SIZE
, bl
);
1603 std::string
libradosstriper::RadosStriperImpl::getUUID()
1606 uuid
.generate_random();
1609 return std::string(suuid
);