1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include <boost/algorithm/string/replace.hpp>
17 #include "libradosstriper/RadosStriperImpl.h"
25 #include "include/types.h"
26 #include "include/uuid.h"
27 #include "include/ceph_fs.h"
28 #include "common/dout.h"
29 #include "common/strtol.h"
30 #include "common/RefCountedObj.h"
31 #include "osdc/Striper.h"
32 #include "librados/AioCompletionImpl.h"
33 #include <cls/lock/cls_lock_client.h>
36 * This file contains the actual implementation of the rados striped objects interface.
38 * Striped objects are stored in rados in a set of regular rados objects, after their
39 * content has been striped using the osdc/Striper interface.
41 * The external attributes of the striped object are mapped to the attributes of the
42 * first underlying object. This first object has a set of extra external attributes
43 * storing the layout of the striped object for future read back. These attributes are :
44 * - striper.layout.object_size : the size of rados objects used.
45 * Must be a multiple of striper.layout.stripe_unit
46 * - striper.layout.stripe_unit : the size of a stripe unit
47 * - striper.layout.stripe_count : the number of stripes used
48 * - striper.size : total striped object size
50 * In general operations on striped objects are not atomic.
51 * However, a certain number of safety guards have been put to make the interface closer
53 * - each data operation takes a shared lock on the first rados object for the
54 * whole time of the operation
55 * - the remove and trunc operations take an exclusive lock on the first rados object
56 * for the whole time of the operation
57 * This makes sure that no removal/truncation of a striped object occurs while
58 * data operations are happening and vice versa. It thus makes sure that the layout
59 * of a striped object does not change during data operation, which is essential for
62 * Still the writing to a striped object is not atomic. This means in particular that
63 * the size of an object may not be in sync with its content at all times.
64 * As the size is always guaranteed to be updated first and in an atomic way, and as
65 * sparse striped objects are supported (see below), what will typically happen is
66 * that a reader that comes too soon after a write will read 0s instead of the actual
69 * Note that remove handles the pieces of the striped object in reverse order,
70 * so that the head object is removed last, making the completion of the deletion atomic.
72 * Striped objects can be sparse, typically in case data was written at the end of the
73 * striped object only. In such a case, some rados objects constituting the striped object
74 * may be missing. Others can be partial (only the beginning will have data)
75 * When dealing with such sparse striped files, missing objects are detected and
76 * considered as full of 0s. They are however not created until real data is written
79 * There are a number of missing features/improvements that could be implemented.
80 * Here are some ideas :
81 * - implementation of missing entry points (compared to rados)
82 * In particular : clone_range, sparse_read, exec, aio_flush_async, tmaps, omaps, ...
86 #define dout_subsys ceph_subsys_rados
88 #define dout_prefix *_dout << "libradosstriper: "
90 /// size of xattr buffer
91 #define XATTR_BUFFER_SIZE 32
93 /// names of the different xattr entries
94 #define XATTR_LAYOUT_STRIPE_UNIT "striper.layout.stripe_unit"
95 #define XATTR_LAYOUT_STRIPE_COUNT "striper.layout.stripe_count"
96 #define XATTR_LAYOUT_OBJECT_SIZE "striper.layout.object_size"
97 #define XATTR_SIZE "striper.size"
98 #define LOCK_PREFIX "lock."
100 /// name of the lock used on objects to ensure layout stability during IO
101 #define RADOS_LOCK_NAME "striper.lock"
103 /// format of the extension of rados objects created for a given striped object
104 #define RADOS_OBJECT_EXTENSION_FORMAT ".%016llx"
106 /// default object layout
107 struct ceph_file_layout default_file_layout
= {
108 init_le32(1<<22), // fl_stripe_unit
109 init_le32(1), // fl_stripe_count
110 init_le32(1<<22), // fl_object_size
111 init_le32(0), // fl_cas_hash
112 init_le32(0), // fl_object_stripe_unit
113 init_le32(-1), // fl_unused
114 init_le32(-1), // fl_pg_pool
117 using libradosstriper::MultiAioCompletionImplPtr
;
121 ///////////////////////// CompletionData /////////////////////////////
124 * struct handling the data needed to pass to the call back
125 * function in asynchronous operations
127 struct CompletionData
: RefCountedObject
{
129 void complete(int r
);
130 /// striper to be used to handle the write completion
131 libradosstriper::RadosStriperImpl
*m_striper
;
132 /// striped object concerned by the write operation
134 /// shared lock to be released at completion
135 std::string m_lockCookie
;
136 /// completion handler
137 librados::IoCtxImpl::C_aio_Complete
*m_ack
;
139 CompletionData(libradosstriper::RadosStriperImpl
* striper
,
140 const std::string
& soid
,
141 const std::string
& lockCookie
,
142 librados::AioCompletionImpl
*userCompletion
= 0);
143 ~CompletionData() override
;
147 CompletionData::CompletionData
148 (libradosstriper::RadosStriperImpl
* striper
,
149 const std::string
& soid
,
150 const std::string
& lockCookie
,
151 librados::AioCompletionImpl
*userCompletion
) :
152 RefCountedObject(striper
->cct()),
153 m_striper(striper
), m_soid(soid
), m_lockCookie(lockCookie
), m_ack(0) {
155 if (userCompletion
) {
156 m_ack
= new librados::IoCtxImpl::C_aio_Complete(userCompletion
);
157 userCompletion
->io
= striper
->m_ioCtxImpl
;
161 CompletionData::~CompletionData() {
162 if (m_ack
) delete m_ack
;
166 void CompletionData::complete(int r
) {
167 if (m_ack
) m_ack
->finish(r
);
171 * struct handling the data needed to pass to the call back
172 * function in asynchronous read operations
174 struct ReadCompletionData
: CompletionData
{
175 /// bufferlist containing final result
177 /// extents that will be read
178 std::vector
<ObjectExtent
>* m_extents
;
179 /// intermediate results
180 std::vector
<bufferlist
>* m_resultbl
;
181 /// return code of read completion, to be remembered until unlocking happened
183 /// completion object for the unlocking of the striped object at the end of the read
184 librados::AioCompletion
*m_unlockCompletion
;
185 /// complete method for when reading is over
186 void complete_read(int r
);
187 /// complete method for when object is unlocked
188 void complete_unlock(int r
);
191 FRIEND_MAKE_REF(ReadCompletionData
);
192 ReadCompletionData(libradosstriper::RadosStriperImpl
* striper
,
193 const std::string
& soid
,
194 const std::string
& lockCookie
,
195 librados::AioCompletionImpl
*userCompletion
,
197 std::vector
<ObjectExtent
>* extents
,
198 std::vector
<bufferlist
>* resultbl
);
199 ~ReadCompletionData() override
;
202 ReadCompletionData::ReadCompletionData
203 (libradosstriper::RadosStriperImpl
* striper
,
204 const std::string
& soid
,
205 const std::string
& lockCookie
,
206 librados::AioCompletionImpl
*userCompletion
,
208 std::vector
<ObjectExtent
>* extents
,
209 std::vector
<bufferlist
>* resultbl
) :
210 CompletionData(striper
, soid
, lockCookie
, userCompletion
),
211 m_bl(bl
), m_extents(extents
), m_resultbl(resultbl
), m_readRc(0),
212 m_unlockCompletion(0) {}
214 ReadCompletionData::~ReadCompletionData() {
215 m_unlockCompletion
->release();
220 void ReadCompletionData::complete_read(int r
) {
221 // gather data into final buffer
222 Striper::StripedReadResult readResult
;
223 vector
<bufferlist
>::iterator bit
= m_resultbl
->begin();
224 for (vector
<ObjectExtent
>::iterator eit
= m_extents
->begin();
225 eit
!= m_extents
->end();
227 readResult
.add_partial_result(m_striper
->cct(), *bit
, eit
->buffer_extents
);
230 readResult
.assemble_result(m_striper
->cct(), *m_bl
, true);
231 // Remember return code
235 void ReadCompletionData::complete_unlock(int r
) {
236 // call parent's completion method
237 // Note that we ignore the return code of the unlock as we cannot do much about it
238 CompletionData::complete(m_readRc
?m_readRc
:m_bl
->length());
242 * struct handling the data needed to pass to the call back
243 * function in asynchronous write operations
245 struct WriteCompletionData
: CompletionData
{
246 /// safe completion handler
247 librados::IoCtxImpl::C_aio_Complete
*m_safe
;
248 /// completion object for the unlocking of the striped object at the end of the write
249 librados::AioCompletion
*m_unlockCompletion
;
250 /// return code of write completion, to be remembered until unlocking happened
252 /// complete method for when writing is over
253 void complete_write(int r
);
254 /// complete method for when object is unlocked
255 void complete_unlock(int r
);
259 FRIEND_MAKE_REF(WriteCompletionData
);
261 WriteCompletionData(libradosstriper::RadosStriperImpl
* striper
,
262 const std::string
& soid
,
263 const std::string
& lockCookie
,
264 librados::AioCompletionImpl
*userCompletion
);
266 ~WriteCompletionData() override
;
269 WriteCompletionData::WriteCompletionData
270 (libradosstriper::RadosStriperImpl
* striper
,
271 const std::string
& soid
,
272 const std::string
& lockCookie
,
273 librados::AioCompletionImpl
*userCompletion
) :
274 CompletionData(striper
, soid
, lockCookie
, userCompletion
),
275 m_safe(0), m_unlockCompletion(0), m_writeRc(0) {
276 if (userCompletion
) {
277 m_safe
= new librados::IoCtxImpl::C_aio_Complete(userCompletion
);
281 WriteCompletionData::~WriteCompletionData() {
282 m_unlockCompletion
->release();
283 if (m_safe
) delete m_safe
;
286 void WriteCompletionData::complete_unlock(int r
) {
287 // call parent's completion method
288 // Note that we ignore the return code of the unlock as we cannot do much about it
289 CompletionData::complete(m_writeRc
);
292 void WriteCompletionData::complete_write(int r
) {
293 // Remember return code
297 void WriteCompletionData::safe(int r
) {
298 if (m_safe
) m_safe
->finish(r
);
301 struct RemoveCompletionData
: CompletionData
{
306 FRIEND_MAKE_REF(RemoveCompletionData
);
309 * note that the constructed object will take ownership of the lock
311 RemoveCompletionData(libradosstriper::RadosStriperImpl
* striper
,
312 const std::string
& soid
,
313 const std::string
& lockCookie
,
314 librados::AioCompletionImpl
*userCompletion
,
316 CompletionData(striper
, soid
, lockCookie
, userCompletion
), flags(flags
) {}
320 * struct handling the data needed to pass to the call back
321 * function in asynchronous truncate operations
323 struct TruncateCompletionData
: RefCountedObject
{
324 /// striper to be used
325 libradosstriper::RadosStriperImpl
*m_striper
;
326 /// striped object concerned by the truncate operation
328 /// the final size of the truncated object
332 FRIEND_MAKE_REF(TruncateCompletionData
);
334 TruncateCompletionData(libradosstriper::RadosStriperImpl
* striper
,
335 const std::string
& soid
,
337 RefCountedObject(striper
->cct()),
338 m_striper(striper
), m_soid(soid
), m_size(size
) {
342 ~TruncateCompletionData() override
{
348 * struct handling the data needed to pass to the call back
349 * function in asynchronous read operations of a Rados File
351 struct RadosReadCompletionData
: RefCountedObject
{
352 /// the multi asynch io completion object to be used
353 MultiAioCompletionImplPtr m_multiAioCompl
;
354 /// the expected number of bytes
355 uint64_t m_expectedBytes
;
356 /// the bufferlist object where data have been written
360 FRIEND_MAKE_REF(RadosReadCompletionData
);
362 RadosReadCompletionData(MultiAioCompletionImplPtr multiAioCompl
,
363 uint64_t expectedBytes
,
365 CephContext
*context
) :
366 RefCountedObject(context
),
367 m_multiAioCompl(multiAioCompl
), m_expectedBytes(expectedBytes
), m_bl(bl
) {}
371 * struct handling (most of) the data needed to pass to the call back
372 * function in asynchronous stat operations.
373 * Inherited by the actual type for adding time information in different
374 * versions (time_t or struct timespec)
376 struct BasicStatCompletionData
: CompletionData
{
377 // MultiAioCompletionImpl used to handle the double async
378 // call in the back (stat + getxattr)
379 libradosstriper::MultiAioCompletionImpl
*m_multiCompletion
;
380 // where to store the size of first object
381 // this will be ignored but we need a place to store it when
382 // async stat is called
383 uint64_t m_objectSize
;
384 // where to store the file size
386 /// the bufferlist object used for the getxattr call
388 /// return code of the stat
390 /// return code of the getxattr
395 BasicStatCompletionData(libradosstriper::RadosStriperImpl
* striper
,
396 const std::string
& soid
,
397 librados::AioCompletionImpl
*userCompletion
,
398 libradosstriper::MultiAioCompletionImpl
*multiCompletion
,
400 CompletionData(striper
, soid
, "", userCompletion
),
401 m_multiCompletion(multiCompletion
), m_psize(psize
),
402 m_statRC(0), m_getxattrRC(0) {};
407 * struct handling the data needed to pass to the call back
408 * function in asynchronous stat operations.
409 * Simple templated extension of BasicStatCompletionData.
410 * The template parameter is the type of the time information
411 * (used with time_t for stat and struct timespec for stat2)
413 template<class TimeType
>
414 struct StatCompletionData
: BasicStatCompletionData
{
415 // where to store the file time
418 FRIEND_MAKE_REF(StatCompletionData
);
420 StatCompletionData
<TimeType
>(libradosstriper::RadosStriperImpl
* striper
,
421 const std::string
& soid
,
422 librados::AioCompletionImpl
*userCompletion
,
423 libradosstriper::MultiAioCompletionImpl
*multiCompletion
,
426 BasicStatCompletionData(striper
, soid
, userCompletion
, multiCompletion
, psize
),
431 * struct handling the data needed to pass to the call back
432 * function in asynchronous remove operations of a Rados File
434 struct RadosRemoveCompletionData
: RefCountedObject
{
435 /// the multi asynch io completion object to be used
436 MultiAioCompletionImplPtr m_multiAioCompl
;
438 FRIEND_MAKE_REF(RadosRemoveCompletionData
);
440 RadosRemoveCompletionData(MultiAioCompletionImplPtr multiAioCompl
,
441 CephContext
*context
) :
442 RefCountedObject(context
),
443 m_multiAioCompl(multiAioCompl
) {};
449 ///////////////////////// constructor /////////////////////////////
451 libradosstriper::RadosStriperImpl::RadosStriperImpl(librados::IoCtx
& ioctx
, librados::IoCtxImpl
*ioctx_impl
) :
452 m_refCnt(0), m_radosCluster(ioctx
), m_ioCtx(ioctx
), m_ioCtxImpl(ioctx_impl
),
453 m_layout(default_file_layout
) {}
455 ///////////////////////// layout /////////////////////////////
457 int libradosstriper::RadosStriperImpl::setObjectLayoutStripeUnit
458 (unsigned int stripe_unit
)
460 /* stripe unit must be non-zero, 64k increment */
461 if (!stripe_unit
|| (stripe_unit
& (CEPH_MIN_STRIPE_UNIT
-1)))
463 m_layout
.fl_stripe_unit
= stripe_unit
;
467 int libradosstriper::RadosStriperImpl::setObjectLayoutStripeCount
468 (unsigned int stripe_count
)
470 /* stripe count must be non-zero */
473 m_layout
.fl_stripe_count
= stripe_count
;
477 int libradosstriper::RadosStriperImpl::setObjectLayoutObjectSize
478 (unsigned int object_size
)
480 /* object size must be non-zero, 64k increment */
481 if (!object_size
|| (object_size
& (CEPH_MIN_STRIPE_UNIT
-1)))
483 /* object size must be a multiple of stripe unit */
484 if (object_size
< m_layout
.fl_stripe_unit
||
485 object_size
% m_layout
.fl_stripe_unit
)
487 m_layout
.fl_object_size
= object_size
;
491 ///////////////////////// xattrs /////////////////////////////
493 int libradosstriper::RadosStriperImpl::getxattr(const object_t
& soid
,
497 std::string firstObjOid
= getObjectId(soid
, 0);
498 return m_ioCtx
.getxattr(firstObjOid
, name
, bl
);
501 int libradosstriper::RadosStriperImpl::setxattr(const object_t
& soid
,
505 std::string firstObjOid
= getObjectId(soid
, 0);
506 return m_ioCtx
.setxattr(firstObjOid
, name
, bl
);
509 int libradosstriper::RadosStriperImpl::getxattrs(const object_t
& soid
,
510 map
<string
, bufferlist
>& attrset
)
512 std::string firstObjOid
= getObjectId(soid
, 0);
513 int rc
= m_ioCtx
.getxattrs(firstObjOid
, attrset
);
515 // cleanup internal attributes dedicated to striping and locking
516 attrset
.erase(XATTR_LAYOUT_STRIPE_UNIT
);
517 attrset
.erase(XATTR_LAYOUT_STRIPE_COUNT
);
518 attrset
.erase(XATTR_LAYOUT_OBJECT_SIZE
);
519 attrset
.erase(XATTR_SIZE
);
520 attrset
.erase(std::string(LOCK_PREFIX
) + RADOS_LOCK_NAME
);
524 int libradosstriper::RadosStriperImpl::rmxattr(const object_t
& soid
,
527 std::string firstObjOid
= getObjectId(soid
, 0);
528 return m_ioCtx
.rmxattr(firstObjOid
, name
);
531 ///////////////////////// io /////////////////////////////
533 int libradosstriper::RadosStriperImpl::write(const std::string
& soid
,
534 const bufferlist
& bl
,
538 // open the object. This will create it if needed, retrieve its layout
539 // and size and take a shared lock on it
540 ceph_file_layout layout
;
541 std::string lockCookie
;
542 int rc
= createAndOpenStripedObject(soid
, &layout
, len
+off
, &lockCookie
, true);
544 return write_in_open_object(soid
, layout
, lockCookie
, bl
, len
, off
);
547 int libradosstriper::RadosStriperImpl::append(const std::string
& soid
,
548 const bufferlist
& bl
,
551 // open the object. This will create it if needed, retrieve its layout
552 // and size and take a shared lock on it
553 ceph_file_layout layout
;
555 std::string lockCookie
;
556 int rc
= openStripedObjectForWrite(soid
, &layout
, &size
, &lockCookie
, false);
558 return write_in_open_object(soid
, layout
, lockCookie
, bl
, len
, size
);
561 int libradosstriper::RadosStriperImpl::write_full(const std::string
& soid
,
562 const bufferlist
& bl
)
564 int rc
= trunc(soid
, 0);
565 if (rc
&& rc
!= -ENOENT
) return rc
; // ENOENT is obviously ok
566 return write(soid
, bl
, bl
.length(), 0);
569 int libradosstriper::RadosStriperImpl::read(const std::string
& soid
,
574 // create a completion object
575 librados::AioCompletionImpl c
;
576 // call asynchronous method
577 int rc
= aio_read(soid
, &c
, bl
, len
, off
);
578 // and wait for completion
580 // wait for completion
581 c
.wait_for_complete_and_cb();
583 rc
= c
.get_return_value();
588 ///////////////////////// asynchronous io /////////////////////////////
590 int libradosstriper::RadosStriperImpl::aio_write(const std::string
& soid
,
591 librados::AioCompletionImpl
*c
,
592 const bufferlist
& bl
,
596 ceph_file_layout layout
;
597 std::string lockCookie
;
598 int rc
= createAndOpenStripedObject(soid
, &layout
, len
+off
, &lockCookie
, true);
600 return aio_write_in_open_object(soid
, c
, layout
, lockCookie
, bl
, len
, off
);
603 int libradosstriper::RadosStriperImpl::aio_append(const std::string
& soid
,
604 librados::AioCompletionImpl
*c
,
605 const bufferlist
& bl
,
608 ceph_file_layout layout
;
610 std::string lockCookie
;
611 int rc
= openStripedObjectForWrite(soid
, &layout
, &size
, &lockCookie
, false);
613 // create a completion object
614 return aio_write_in_open_object(soid
, c
, layout
, lockCookie
, bl
, len
, size
);
617 int libradosstriper::RadosStriperImpl::aio_write_full(const std::string
& soid
,
618 librados::AioCompletionImpl
*c
,
619 const bufferlist
& bl
)
621 int rc
= trunc(soid
, 0);
623 return aio_write(soid
, c
, bl
, bl
.length(), 0);
626 static void rados_read_aio_unlock_complete(rados_striper_multi_completion_t c
, void *arg
)
628 auto cdata
= ceph::ref_t
<ReadCompletionData
>(static_cast<ReadCompletionData
*>(arg
), false);
629 libradosstriper::MultiAioCompletionImpl
*comp
=
630 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
631 cdata
->complete_unlock(comp
->rval
);
634 static void striper_read_aio_req_complete(rados_striper_multi_completion_t c
, void *arg
)
636 auto cdata
= static_cast<ReadCompletionData
*>(arg
);
637 // launch the async unlocking of the object
638 cdata
->m_striper
->aio_unlockObject(cdata
->m_soid
, cdata
->m_lockCookie
, cdata
->m_unlockCompletion
);
639 // complete the read part in parallel
640 libradosstriper::MultiAioCompletionImpl
*comp
=
641 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
642 cdata
->complete_read(comp
->rval
);
645 static void rados_req_read_complete(rados_completion_t c
, void *arg
)
647 auto data
= static_cast<RadosReadCompletionData
*>(arg
);
648 int rc
= rados_aio_get_return_value(c
);
649 // We need to handle the case of sparse files here
651 // the object did not exist at all. This can happen for sparse files.
652 // we consider we've read 0 bytes and it will fall into next case
656 if (rc
>= 0 && (((uint64_t)rc
) < data
->m_expectedBytes
)) {
657 // only partial data were present in the object (or the object did not
658 // even exist if we've gone through previous case).
659 // This is typical of sparse file and we need to complete with 0s.
660 unsigned int lenOfZeros
= data
->m_expectedBytes
-rc
;
661 unsigned int existingDataToZero
= min(data
->m_bl
->length()-rc
, lenOfZeros
);
662 if (existingDataToZero
> 0) {
663 data
->m_bl
->zero(rc
, existingDataToZero
);
665 if (lenOfZeros
> existingDataToZero
) {
666 ceph::bufferptr
zeros(ceph::buffer::create(lenOfZeros
-existingDataToZero
));
668 data
->m_bl
->push_back(zeros
);
670 nread
= data
->m_expectedBytes
;
672 auto multi_aio_comp
= data
->m_multiAioCompl
;
673 multi_aio_comp
->complete_request(nread
);
674 multi_aio_comp
->safe_request(rc
);
677 int libradosstriper::RadosStriperImpl::aio_read(const std::string
& soid
,
678 librados::AioCompletionImpl
*c
,
683 // open the object. This will retrieve its layout and size
684 // and take a shared lock on it
685 ceph_file_layout layout
;
687 std::string lockCookie
;
688 int rc
= openStripedObjectForRead(soid
, &layout
, &size
, &lockCookie
);
690 // find out the actual number of bytes we can read
693 // nothing to read ! We are done.
696 read_len
= min(len
, (size_t)(size
-off
));
698 // get list of extents to be read from
699 vector
<ObjectExtent
> *extents
= new vector
<ObjectExtent
>();
701 std::string format
= soid
;
702 boost::replace_all(format
, "%", "%%");
703 format
+= RADOS_OBJECT_EXTENSION_FORMAT
;
705 l
.from_legacy(layout
);
706 Striper::file_to_extents(cct(), format
.c_str(), &l
, off
, read_len
,
710 // create a completion object and transfer ownership of extents and resultbl
711 vector
<bufferlist
> *resultbl
= new vector
<bufferlist
>(extents
->size());
712 auto cdata
= ceph::make_ref
<ReadCompletionData
>(this, soid
, lockCookie
, c
, bl
, extents
, resultbl
);
715 // create a completion for the unlocking of the striped object at the end of the read
716 librados::AioCompletion
*unlock_completion
=
717 librados::Rados::aio_create_completion(cdata
->get() /* create ref! */, rados_read_aio_unlock_complete
);
718 cdata
->m_unlockCompletion
= unlock_completion
;
719 // create the multiCompletion object handling the reads
720 MultiAioCompletionImplPtr nc
{new libradosstriper::MultiAioCompletionImpl
,
722 nc
->set_complete_callback(cdata
.get(), striper_read_aio_req_complete
);
723 // go through the extents
725 for (vector
<ObjectExtent
>::iterator p
= extents
->begin(); p
!= extents
->end(); ++p
) {
726 // create a buffer list describing where to place data read from current extent
727 bufferlist
*oid_bl
= &((*resultbl
)[i
++]);
728 for (vector
<pair
<uint64_t,uint64_t> >::iterator q
= p
->buffer_extents
.begin();
729 q
!= p
->buffer_extents
.end();
731 bufferlist buffer_bl
;
732 buffer_bl
.substr_of(*bl
, q
->first
, q
->second
);
733 oid_bl
->append(buffer_bl
);
735 // read all extents of a given object in one go
737 // we need 2 references on data as both rados_req_read_safe and rados_req_read_complete
739 auto data
= ceph::make_ref
<RadosReadCompletionData
>(nc
, p
->length
, oid_bl
, cct());
740 librados::AioCompletion
*rados_completion
=
741 librados::Rados::aio_create_completion(data
.detach(), rados_req_read_complete
);
742 r
= m_ioCtx
.aio_read(p
->oid
.name
, rados_completion
, oid_bl
, p
->length
, p
->offset
);
743 rados_completion
->release();
747 nc
->finish_adding_requests();
751 int libradosstriper::RadosStriperImpl::aio_read(const std::string
& soid
,
752 librados::AioCompletionImpl
*c
,
757 // create a buffer list and store it inside the completion object
759 c
->bl
.push_back(buffer::create_static(len
, buf
));
760 // call the bufferlist version of this method
761 return aio_read(soid
, c
, &c
->bl
, len
, off
);
764 int libradosstriper::RadosStriperImpl::aio_flush()
767 // pass to the rados level
768 ret
= m_ioCtx
.aio_flush();
771 //wait all CompletionData are released
772 std::unique_lock l
{lock
};
773 cond
.wait(l
, [this] {return m_refCnt
<= 1;});
777 ///////////////////////// stat and deletion /////////////////////////////
779 int libradosstriper::RadosStriperImpl::stat(const std::string
& soid
, uint64_t *psize
, time_t *pmtime
)
781 // create a completion object
782 librados::AioCompletionImpl c
;
783 // call asynchronous version of stat
784 int rc
= aio_stat(soid
, &c
, psize
, pmtime
);
786 // wait for completion of the stat
787 c
.wait_for_complete();
789 rc
= c
.get_return_value();
794 static void striper_stat_aio_stat_complete(rados_completion_t c
, void *arg
) {
795 auto data
= ceph::ref_t
<BasicStatCompletionData
>(static_cast<BasicStatCompletionData
*>(arg
), false);
796 int rc
= rados_aio_get_return_value(c
);
798 // remember this has failed
801 data
->m_multiCompletion
->complete_request(rc
);
804 static void striper_stat_aio_getxattr_complete(rados_completion_t c
, void *arg
) {
805 auto data
= ceph::ref_t
<BasicStatCompletionData
>(static_cast<BasicStatCompletionData
*>(arg
), false);
806 int rc
= rados_aio_get_return_value(c
);
807 // We need to handle the case of sparse files here
809 // remember this has failed
810 data
->m_getxattrRC
= rc
;
812 // this intermediate string allows to add a null terminator before calling strtol
814 std::string
strsize(data
->m_bl
.c_str(), data
->m_bl
.length());
815 *data
->m_psize
= strict_strtoll(strsize
.c_str(), 10, &err
);
817 lderr(data
->m_striper
->cct()) << XATTR_SIZE
<< " : " << err
<< dendl
;
818 data
->m_getxattrRC
= -EINVAL
;
822 data
->m_multiCompletion
->complete_request(rc
);
825 static void striper_stat_aio_req_complete(rados_striper_multi_completion_t c
,
827 auto data
= ceph::ref_t
<BasicStatCompletionData
>(static_cast<BasicStatCompletionData
*>(arg
), false);
828 if (data
->m_statRC
) {
829 data
->complete(data
->m_statRC
);
831 if (data
->m_getxattrRC
< 0) {
832 data
->complete(data
->m_getxattrRC
);
839 template<class TimeType
>
840 int libradosstriper::RadosStriperImpl::aio_generic_stat
841 (const std::string
& soid
,
842 librados::AioCompletionImpl
*c
,
845 typename
libradosstriper::RadosStriperImpl::StatFunction
<TimeType
>::Type statFunction
)
847 // use a MultiAioCompletion object for dealing with the fact
848 // that we'll do 2 asynchronous calls in parallel
849 MultiAioCompletionImplPtr multi_completion
{
850 new libradosstriper::MultiAioCompletionImpl
, false};
851 // Data object used for passing context to asynchronous calls
852 std::string firstObjOid
= getObjectId(soid
, 0);
853 auto cdata
= ceph::make_ref
<StatCompletionData
<TimeType
>>(this, firstObjOid
, c
, multi_completion
.get(), psize
, pmtime
);
854 multi_completion
->set_complete_callback(cdata
->get() /* create ref! */, striper_stat_aio_req_complete
);
855 // use a regular AioCompletion for the stat async call
856 librados::AioCompletion
*stat_completion
=
857 librados::Rados::aio_create_completion(cdata
->get() /* create ref! */, striper_stat_aio_stat_complete
);
858 multi_completion
->add_safe_request();
859 object_t
obj(firstObjOid
);
860 int rc
= (m_ioCtxImpl
->*statFunction
)(obj
, stat_completion
->pc
,
861 &cdata
->m_objectSize
, cdata
->m_pmtime
);
862 stat_completion
->release();
864 // nothing is really started so cancel everything
865 delete cdata
.detach();
868 // use a regular AioCompletion for the getxattr async call
869 librados::AioCompletion
*getxattr_completion
=
870 librados::Rados::aio_create_completion(cdata
->get() /* create ref! */, striper_stat_aio_getxattr_complete
);
871 multi_completion
->add_safe_request();
872 // in parallel, get the pmsize from the first object asynchronously
873 rc
= m_ioCtxImpl
->aio_getxattr(obj
, getxattr_completion
->pc
,
874 XATTR_SIZE
, cdata
->m_bl
);
875 getxattr_completion
->release();
876 multi_completion
->finish_adding_requests();
878 // the async stat is ongoing, so we need to go on
879 // we mark the getxattr as failed in the data object
880 cdata
->m_getxattrRC
= rc
;
881 multi_completion
->complete_request(rc
);
887 int libradosstriper::RadosStriperImpl::aio_stat(const std::string
& soid
,
888 librados::AioCompletionImpl
*c
,
892 return aio_generic_stat
<time_t>(soid
, c
, psize
, pmtime
, &librados::IoCtxImpl::aio_stat
);
895 int libradosstriper::RadosStriperImpl::stat2(const std::string
& soid
, uint64_t *psize
, struct timespec
*pts
)
897 // create a completion object
898 librados::AioCompletionImpl c
;
899 // call asynchronous version of stat
900 int rc
= aio_stat2(soid
, &c
, psize
, pts
);
902 // wait for completion of the stat
903 c
.wait_for_complete_and_cb();
905 rc
= c
.get_return_value();
910 int libradosstriper::RadosStriperImpl::aio_stat2(const std::string
& soid
,
911 librados::AioCompletionImpl
*c
,
913 struct timespec
*pts
)
915 return aio_generic_stat
<struct timespec
>(soid
, c
, psize
, pts
, &librados::IoCtxImpl::aio_stat2
);
918 static void rados_req_remove_complete(rados_completion_t c
, void *arg
)
920 auto cdata
= static_cast<RadosRemoveCompletionData
*>(arg
);
921 int rc
= rados_aio_get_return_value(c
);
922 // in case the object did not exist, it means we had a sparse file, all is fine
926 cdata
->m_multiAioCompl
->complete_request(rc
);
927 cdata
->m_multiAioCompl
->safe_request(rc
);
930 static void striper_remove_aio_req_complete(rados_striper_multi_completion_t c
, void *arg
)
932 auto cdata
= ceph::ref_t
<RemoveCompletionData
>(static_cast<RemoveCompletionData
*>(arg
), false);
933 libradosstriper::MultiAioCompletionImpl
*comp
=
934 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
935 ldout(cdata
->m_striper
->cct(), 10)
936 << "RadosStriperImpl : striper_remove_aio_req_complete called for "
937 << cdata
->m_soid
<< dendl
;
940 // All went fine, synchronously remove first object
941 rc
= cdata
->m_striper
->m_ioCtx
.remove(cdata
->m_striper
->getObjectId(cdata
->m_soid
, 0),
944 lderr(cdata
->m_striper
->cct())
945 << "RadosStriperImpl : deletion/truncation incomplete for " << cdata
->m_soid
946 << ", as errors were encountered. The file is left present but it's content "
947 << " has been partially removed"
953 int libradosstriper::RadosStriperImpl::remove(const std::string
& soid
, int flags
)
955 // create a completion object
956 librados::AioCompletionImpl c
;
957 // call asynchronous version of remove
958 int rc
= aio_remove(soid
, &c
, flags
);
960 // wait for completion of the remove
961 c
.wait_for_complete_and_cb();
963 rc
= c
.get_return_value();
// Asynchronous removal of the striped object `soid`.
// Visible steps:
//   1. take an exclusive lock on the first rados object, under a freshly
//      generated cookie
//   2. build a RemoveCompletionData around the caller's completion `c`
//   3. create a multi-completion whose complete-callback is
//      striper_remove_aio_req_complete, passing it a reference on the
//      completion data (see "create ref!")
//   4. delegate to internal_aio_remove
// NOTE(review): the remaining parameter line(s), the handling of the
// lock_exclusive() result and the return statement are not visible here.
968 int libradosstriper::RadosStriperImpl::aio_remove(const std::string
& soid
,
969 librados::AioCompletionImpl
*c
,
972 // the RemoveCompletionData object will lock the given soid for the duration
974 std::string lockCookie
= getUUID();
975 int rc
= m_ioCtx
.lock_exclusive(getObjectId(soid
, 0), RADOS_LOCK_NAME
, lockCookie
, "", 0, 0);
977 // create CompletionData for the async remove call
978 auto cdata
= ceph::make_ref
<RemoveCompletionData
>(this, soid
, lockCookie
, c
, flags
);
979 MultiAioCompletionImplPtr multi_completion
{
980 new libradosstriper::MultiAioCompletionImpl
, false};
981 multi_completion
->set_complete_callback(cdata
->get() /* create ref! */, striper_remove_aio_req_complete
);
982 // call asynchronous internal version of remove
984 << "RadosStriperImpl : Aio_remove starting for "
// NOTE(review): `flags` is not forwarded in this call
986 rc
= internal_aio_remove(soid
, multi_completion
);
990 int libradosstriper::RadosStriperImpl::internal_aio_remove(
991 const std::string
& soid
,
992 MultiAioCompletionImplPtr multi_completion
,
995 std::string firstObjOid
= getObjectId(soid
, 0);
997 // check size and get number of rados objects to delete
998 uint64_t nb_objects
= 0;
1000 int rc
= getxattr(soid
, XATTR_SIZE
, bl2
);
1002 // no object size (or not able to get it)
1003 // try to find the number of object "by hand"
1006 while (!m_ioCtx
.stat(getObjectId(soid
, nb_objects
), &psize
, &pmtime
)) {
1010 // count total number of rados objects in the striped object
1012 // this intermediate string allows to add a null terminator before calling strtol
1013 std::string
strsize(bl2
.c_str(), bl2
.length());
1014 uint64_t size
= strict_strtoll(strsize
.c_str(), 10, &err
);
1016 lderr(cct()) << XATTR_SIZE
<< " : " << err
<< dendl
;
1020 uint64_t object_size
= m_layout
.fl_object_size
;
1021 uint64_t su
= m_layout
.fl_stripe_unit
;
1022 uint64_t stripe_count
= m_layout
.fl_stripe_count
;
1023 uint64_t nb_complete_sets
= size
/ (object_size
*stripe_count
);
1024 uint64_t remaining_data
= size
% (object_size
*stripe_count
);
1025 uint64_t remaining_stripe_units
= (remaining_data
+ su
-1) / su
;
1026 uint64_t remaining_objects
= std::min(remaining_stripe_units
, stripe_count
);
1027 nb_objects
= nb_complete_sets
* stripe_count
+ remaining_objects
;
1029 // delete rados objects in reverse order
1030 // Note that we do not drop the first object. This one will only be dropped
1031 // if all other removals have been successful, and this is done in the
1032 // callback of the multi_completion object
1034 for (int i
= nb_objects
-1; i
>= 1; i
--) {
1035 multi_completion
->add_request();
1036 auto data
= ceph::make_ref
<RadosRemoveCompletionData
>(multi_completion
, cct());
1037 librados::AioCompletion
*rados_completion
=
1038 librados::Rados::aio_create_completion(data
->get() /* create ref! */,
1039 rados_req_remove_complete
);
1041 rcr
= m_ioCtx
.aio_remove(getObjectId(soid
, i
), rados_completion
);
1043 rcr
= m_ioCtx
.aio_remove(getObjectId(soid
, i
), rados_completion
, flags
);
1045 rados_completion
->release();
1046 if (rcr
< 0 and -ENOENT
!= rcr
) {
1047 lderr(cct()) << "RadosStriperImpl::remove : deletion incomplete for " << soid
1048 << ", as " << getObjectId(soid
, i
) << " could not be deleted (rc=" << rc
<< ")"
1053 // we are over adding requests to the multi_completion object
1054 multi_completion
->finish_adding_requests();
1057 } catch (ErrorCode
&e
) {
1058 // error caught when trying to take the exclusive lock
// Resizes the striped object `soid` to `size` bytes.
// Visible steps:
//   1. take an exclusive cls lock on the first rados object through an
//      ObjectWriteOperation
//   2. load the stored layout and current size
//   3. call truncate() when shrinking, grow() when enlarging; nothing is
//      done when the size is unchanged
//   4. release the lock, ignoring the outcome of the unlock
// NOTE(review): the error checks between these steps and the final return
// are not visible in this view.
1064 int libradosstriper::RadosStriperImpl::trunc(const std::string
& soid
, uint64_t size
)
1066 // lock the object in exclusive mode
1067 std::string firstObjOid
= getObjectId(soid
, 0);
1068 librados::ObjectWriteOperation op
;
1070 std::string lockCookie
= RadosStriperImpl::getUUID();
1071 utime_t dur
= utime_t();
1072 rados::cls::lock::lock(&op
, RADOS_LOCK_NAME
, ClsLockType::EXCLUSIVE
, lockCookie
, "", "", dur
, 0);
1073 int rc
= m_ioCtx
.operate(firstObjOid
, &op
);
1075 // load layout and size
1076 ceph_file_layout layout
;
1077 uint64_t original_size
;
1078 rc
= internal_get_layout_and_size(firstObjOid
, &layout
, &original_size
);
1080 if (size
< original_size
) {
1081 rc
= truncate(soid
, original_size
, size
, layout
);
1082 } else if (size
> original_size
) {
1083 rc
= grow(soid
, original_size
, size
, layout
);
1086 // unlock object, ignore return code as we cannot do much
1087 m_ioCtx
.unlock(firstObjOid
, RADOS_LOCK_NAME
, lockCookie
);
1093 ///////////////////////// private helpers /////////////////////////////
1095 std::string
libradosstriper::RadosStriperImpl::getObjectId(const object_t
& soid
,
1096 long long unsigned objectno
)
1098 std::ostringstream s
;
1099 s
<< soid
<< '.' << std::setfill ('0') << std::setw(16) << std::hex
<< objectno
;
1103 void libradosstriper::RadosStriperImpl::unlockObject(const std::string
& soid
,
1104 const std::string
& lockCookie
)
1106 // unlock the shared lock on the first rados object
1107 std::string firstObjOid
= getObjectId(soid
, 0);
1108 m_ioCtx
.unlock(firstObjOid
, RADOS_LOCK_NAME
, lockCookie
);
1111 void libradosstriper::RadosStriperImpl::aio_unlockObject(const std::string
& soid
,
1112 const std::string
& lockCookie
,
1113 librados::AioCompletion
*c
)
1115 // unlock the shared lock on the first rados object
1116 std::string firstObjOid
= getObjectId(soid
, 0);
1117 m_ioCtx
.aio_unlock(firstObjOid
, RADOS_LOCK_NAME
, lockCookie
, c
);
1120 static void rados_write_aio_unlock_complete(rados_striper_multi_completion_t c
, void *arg
)
1122 auto cdata
= ceph::ref_t
<WriteCompletionData
>(static_cast<WriteCompletionData
*>(arg
), false);
1123 libradosstriper::MultiAioCompletionImpl
*comp
=
1124 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
1125 cdata
->complete_unlock(comp
->rval
);
1128 static void striper_write_aio_req_complete(rados_striper_multi_completion_t c
, void *arg
)
1130 auto cdata
= ceph::ref_t
<WriteCompletionData
>(static_cast<WriteCompletionData
*>(arg
), false);
1131 // launch the async unlocking of the object
1132 cdata
->m_striper
->aio_unlockObject(cdata
->m_soid
, cdata
->m_lockCookie
, cdata
->m_unlockCompletion
);
1133 // complete the write part in parallel
1134 libradosstriper::MultiAioCompletionImpl
*comp
=
1135 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
1136 cdata
->complete_write(comp
->rval
);
1139 static void striper_write_aio_req_safe(rados_striper_multi_completion_t c
, void *arg
)
1141 auto cdata
= ceph::ref_t
<WriteCompletionData
>(static_cast<WriteCompletionData
*>(arg
), false);
1142 libradosstriper::MultiAioCompletionImpl
*comp
=
1143 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
1144 cdata
->safe(comp
->rval
);
// Synchronous write into a striped object that the caller has already
// opened (and locked under `lockCookie`).
// Visible steps: build a WriteCompletionData without user completion
// (nullptr), create the completion used for the final unlock, create the
// multi-completion with its complete and safe callbacks, start the write
// through internal_aio_write, then wait for completion, safety and unlock
// before fetching the multi-completion's return value.
// NOTE(review): the trailing parameters (`len` and `off` are used below),
// the check on internal_aio_write's result and the return statement are not
// visible in this view.
1147 int libradosstriper::RadosStriperImpl::write_in_open_object(const std::string
& soid
,
1148 const ceph_file_layout
& layout
,
1149 const std::string
& lockCookie
,
1150 const bufferlist
& bl
,
1153 // create a completion object to be passed to the callbacks of the multicompletion
1154 // we need 3 references as striper_write_aio_req_complete will release two and
1155 // striper_write_aio_req_safe will release one
1156 auto cdata
= ceph::make_ref
<WriteCompletionData
>(this, soid
, lockCookie
, nullptr);
1157 // create a completion object for the unlocking of the striped object at the end of the write
1158 librados::AioCompletion
*unlock_completion
=
1159 librados::Rados::aio_create_completion(cdata
->get() /* create ref! */, rados_write_aio_unlock_complete
);
1160 cdata
->m_unlockCompletion
= unlock_completion
;
1161 // create the multicompletion that will handle the write completion
1162 MultiAioCompletionImplPtr c
{new libradosstriper::MultiAioCompletionImpl
,
1164 c
->set_complete_callback(cdata
->get() /* create ref! */, striper_write_aio_req_complete
);
1165 c
->set_safe_callback(cdata
->get() /* create ref! */, striper_write_aio_req_safe
);
1166 // call the asynchronous API
1167 int rc
= internal_aio_write(soid
, c
, bl
, len
, off
, layout
);
1169 // wait for completion and safety of data
1170 c
->wait_for_complete_and_cb();
1171 c
->wait_for_safe_and_cb();
1172 // wait for the unlocking
1173 unlock_completion
->wait_for_complete();
1175 rc
= c
->get_return_value();
// Asynchronous write into a striped object that the caller has already
// opened (and locked under `lockCookie`); `c` is the caller's completion.
// Visible steps: build a WriteCompletionData around `c`, point `c->io` at
// this striper's IoCtx implementation, create the completion used for the
// final unlock, create the multi-completion with its complete and safe
// callbacks, and start the write through internal_aio_write.
// Unlike write_in_open_object, no wait is visible here.
// NOTE(review): the trailing parameters (`len` and `off` are used below) and
// the return statement are not visible in this view.
1180 int libradosstriper::RadosStriperImpl::aio_write_in_open_object(const std::string
& soid
,
1181 librados::AioCompletionImpl
*c
,
1182 const ceph_file_layout
& layout
,
1183 const std::string
& lockCookie
,
1184 const bufferlist
& bl
,
1187 // create a completion object to be passed to the callbacks of the multicompletion
1188 // we need 3 references as striper_write_aio_req_complete will release two and
1189 // striper_write_aio_req_safe will release one
1190 auto cdata
= ceph::make_ref
<WriteCompletionData
>(this, soid
, lockCookie
, c
);
1192 c
->io
= m_ioCtxImpl
;
1193 // create a completion object for the unlocking of the striped object at the end of the write
1194 librados::AioCompletion
*unlock_completion
=
1195 librados::Rados::aio_create_completion(cdata
->get() /* create ref! */, rados_write_aio_unlock_complete
);
1196 cdata
->m_unlockCompletion
= unlock_completion
;
1197 // create the multicompletion that will handle the write completion
1198 libradosstriper::MultiAioCompletionImplPtr nc
{
1199 new libradosstriper::MultiAioCompletionImpl
, false};
1200 nc
->set_complete_callback(cdata
->get() /* create ref! */, striper_write_aio_req_complete
);
1201 nc
->set_safe_callback(cdata
->get() /* create ref! */, striper_write_aio_req_safe
);
1202 // internal asynchronous API
1203 int rc
= internal_aio_write(soid
, nc
, bl
, len
, off
, layout
);
1207 static void rados_req_write_complete(rados_completion_t c
, void *arg
)
1209 auto comp
= reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(arg
);
1210 comp
->complete_request(rados_aio_get_return_value(c
));
1211 comp
->safe_request(rados_aio_get_return_value(c
));
// Splits a write of `len` bytes at offset `off` of striped object `soid`
// into per-rados-object writes and issues them asynchronously, all tracked
// by multi-completion `c`.
// Visible steps: build the object-name format from `soid` (escaping '%'),
// convert the legacy layout, let Striper::file_to_extents compute the
// extents, then for each extent gather the matching pieces of `bl` into one
// buffer and aio_write it; finish_adding_requests() closes the series.
// NOTE(review): the return type, the `len`/`off` parameter lines, the
// empty-buffer guard announced by the first comment, the per-write error
// handling and the return statement are not visible in this view.
1215 libradosstriper::RadosStriperImpl::internal_aio_write(const std::string
& soid
,
1216 libradosstriper::MultiAioCompletionImplPtr c
,
1217 const bufferlist
& bl
,
1220 const ceph_file_layout
& layout
)
1223 // Do not try anything if we are called with empty buffer,
1224 // file_to_extents would raise an exception
1226 // get list of extents to be written to
1227 vector
<ObjectExtent
> extents
;
1228 std::string format
= soid
;
1229 boost::replace_all(format
, "%", "%%");
1230 format
+= RADOS_OBJECT_EXTENSION_FORMAT
;
1232 l
.from_legacy(layout
);
1233 Striper::file_to_extents(cct(), format
.c_str(), &l
, off
, len
, 0, extents
);
1234 // go through the extents
1235 for (vector
<ObjectExtent
>::iterator p
= extents
.begin(); p
!= extents
.end(); ++p
) {
1236 // assemble pieces of a given object into a single buffer list
1238 for (vector
<pair
<uint64_t,uint64_t> >::iterator q
= p
->buffer_extents
.begin();
1239 q
!= p
->buffer_extents
.end();
1241 bufferlist buffer_bl
;
1242 buffer_bl
.substr_of(bl
, q
->first
, q
->second
);
1243 oid_bl
.append(buffer_bl
);
1245 // and write the object
// the raw multi-completion pointer is handed to the callback as its argument
1247 librados::AioCompletion
*rados_completion
=
1248 librados::Rados::aio_create_completion(c
.get(),
1249 rados_req_write_complete
);
1250 r
= m_ioCtx
.aio_write(p
->oid
.name
, rados_completion
, oid_bl
,
1251 p
->length
, p
->offset
);
1252 rados_completion
->release();
1257 c
->finish_adding_requests();
// Looks up attribute `key` in `attrs` and, when present, parses its content
// as a base-10 number with strict_strtol, storing the result in `*value`.
// A parse error is logged as "<key> : <error>".
// NOTE(review): the declaration of `value`, the handling of a missing key
// and the return statements are not visible in this view.
1261 int libradosstriper::RadosStriperImpl::extract_uint32_attr
1262 (std::map
<std::string
, bufferlist
> &attrs
,
1263 const std::string
& key
,
1266 std::map
<std::string
, bufferlist
>::iterator attrsIt
= attrs
.find(key
);
1267 if (attrsIt
!= attrs
.end()) {
1268 // this intermediate string allows to add a null terminator before calling strtol
1269 std::string
strvalue(attrsIt
->second
.c_str(), attrsIt
->second
.length());
1271 *value
= strict_strtol(strvalue
.c_str(), 10, &err
);
1273 lderr(cct()) << key
<< " : " << err
<< dendl
;
// Looks up attribute `key` in `attrs` and, when present, parses its content
// as a base-10 number with strict_strtoll, storing the result in `*value`.
// A parse error is logged as "<key> : <error>".
// NOTE(review): the declaration of `value`, the handling of a missing key
// and the return statements are not visible in this view.
1282 int libradosstriper::RadosStriperImpl::extract_sizet_attr
1283 (std::map
<std::string
, bufferlist
> &attrs
,
1284 const std::string
& key
,
1287 std::map
<std::string
, bufferlist
>::iterator attrsIt
= attrs
.find(key
);
1288 if (attrsIt
!= attrs
.end()) {
1289 // this intermediate string allows to add a null terminator before calling strtol
1290 std::string
strvalue(attrsIt
->second
.c_str(), attrsIt
->second
.length());
1292 *value
= strict_strtoll(strvalue
.c_str(), 10, &err
);
1294 lderr(cct()) << key
<< " : " << err
<< dendl
;
// Loads the striping layout and the size of a striped object from the
// extended attributes of rados object `oid`.
// All attributes are fetched in one getxattrs() call; stripe unit, stripe
// count and object size are then parsed into `*layout` with
// extract_uint32_attr, and the size is parsed with extract_sizet_attr.
// fl_pg_pool is unused and only zeroed to keep valgrind quiet.
// NOTE(review): the last parameter line, the checks on each `rc` and the
// return statements are not visible in this view.
1303 int libradosstriper::RadosStriperImpl::internal_get_layout_and_size(
1304 const std::string
& oid
,
1305 ceph_file_layout
*layout
,
1308 // get external attributes of the first rados object
1309 std::map
<std::string
, bufferlist
> attrs
;
1310 int rc
= m_ioCtx
.getxattrs(oid
, attrs
);
1312 // deal with stripe_unit
1313 rc
= extract_uint32_attr(attrs
, XATTR_LAYOUT_STRIPE_UNIT
, &layout
->fl_stripe_unit
);
1315 // deal with stripe_count
1316 rc
= extract_uint32_attr(attrs
, XATTR_LAYOUT_STRIPE_COUNT
, &layout
->fl_stripe_count
);
1318 // deal with object_size
1319 rc
= extract_uint32_attr(attrs
, XATTR_LAYOUT_OBJECT_SIZE
, &layout
->fl_object_size
);
1323 rc
= extract_sizet_attr(attrs
, XATTR_SIZE
, &ssize
);
1328 // make valgrind happy by setting unused fl_pg_pool
1329 layout
->fl_pg_pool
= 0;
// Opens striped object `soid` for reading: takes a shared cls lock (tag
// "Tag") on its first rados object under a freshly generated cookie, which
// is stored in `*lockCookie`, then loads layout and size into the output
// parameters.
// The lock is released again, and an error logged, on the path where the
// layout and size cannot be loaded.
// NOTE(review): the `size` parameter line, the conditions guarding the
// error paths and the return statements are not visible in this view.
1333 int libradosstriper::RadosStriperImpl::openStripedObjectForRead(
1334 const std::string
& soid
,
1335 ceph_file_layout
*layout
,
1337 std::string
*lockCookie
)
1339 // take a lock the first rados object, if it exists and gets its size
1340 // check, lock and size reading must be atomic and are thus done within a single operation
1341 librados::ObjectWriteOperation op
;
1343 *lockCookie
= getUUID();
1344 utime_t dur
= utime_t();
1345 rados::cls::lock::lock(&op
, RADOS_LOCK_NAME
, ClsLockType::SHARED
, *lockCookie
, "Tag", "", dur
, 0);
1346 std::string firstObjOid
= getObjectId(soid
, 0);
1347 int rc
= m_ioCtx
.operate(firstObjOid
, &op
);
1349 // error case (including -ENOENT)
1352 rc
= internal_get_layout_and_size(firstObjOid
, layout
, size
);
1354 unlockObject(soid
, *lockCookie
);
1355 lderr(cct()) << "RadosStriperImpl::openStripedObjectForRead : "
1356 << "could not load layout and size for "
1357 << soid
<< " : rc = " << rc
<< dendl
;
// Opens striped object `soid` for writing, creating it when needed.
// Visible steps:
//   1. take a shared cls lock (tag "Tag") on the first rados object under a
//      freshly generated cookie stored in `*lockCookie`
//   2. on -ENOENT, delegate to createAndOpenStripedObject
//   3. otherwise load layout and current size, unlocking and logging when
//      that fails
//   4. update the stored size in one write operation made of a cmpxattr
//      guard followed by a setxattr; -ECANCELED from that operation gets
//      dedicated handling
//   5. unlock and log when the size could not be set
// NOTE(review): the `size` parameter line, several conditions, the
// adjustments made to `*size` and the return statements are not visible in
// this view.
1362 int libradosstriper::RadosStriperImpl::openStripedObjectForWrite(const std::string
& soid
,
1363 ceph_file_layout
*layout
,
1365 std::string
*lockCookie
,
1366 bool isFileSizeAbsolute
)
1368 // take a lock the first rados object, if it exists
1369 // check and lock must be atomic and are thus done within a single operation
1370 librados::ObjectWriteOperation op
;
1372 *lockCookie
= getUUID();
1373 utime_t dur
= utime_t();
1374 rados::cls::lock::lock(&op
, RADOS_LOCK_NAME
, ClsLockType::SHARED
, *lockCookie
, "Tag", "", dur
, 0);
1375 std::string firstObjOid
= getObjectId(soid
, 0);
1376 int rc
= m_ioCtx
.operate(firstObjOid
, &op
);
1378 if (rc
== -ENOENT
) {
1379 // object does not exist, delegate to createEmptyStripedObject
// NOTE(review): this `rc` shadows the one declared above
1380 int rc
= createAndOpenStripedObject(soid
, layout
, *size
, lockCookie
, isFileSizeAbsolute
);
1381 // return original size
1390 rc
= internal_get_layout_and_size(firstObjOid
, layout
, &curSize
);
1392 unlockObject(soid
, *lockCookie
);
1393 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1394 << "could not load layout and size for "
1395 << soid
<< " : rc = " << rc
<< dendl
;
1398 // atomically update object size, only if smaller than current one
1399 if (!isFileSizeAbsolute
)
1401 librados::ObjectWriteOperation writeOp
;
1402 writeOp
.cmpxattr(XATTR_SIZE
, LIBRADOS_CMPXATTR_OP_GT
, *size
);
1403 std::ostringstream oss
;
1406 bl
.append(oss
.str());
1407 writeOp
.setxattr(XATTR_SIZE
, bl
);
1408 rc
= m_ioCtx
.operate(firstObjOid
, &writeOp
);
1409 // return current size
1411 // handle case where objectsize is already bigger than size
1412 if (-ECANCELED
== rc
)
1415 unlockObject(soid
, *lockCookie
);
1416 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1417 << "could not set new size for "
1418 << soid
<< " : rc = " << rc
<< dendl
;
1423 int libradosstriper::RadosStriperImpl::createAndOpenStripedObject(const std::string
& soid
,
1424 ceph_file_layout
*layout
,
1426 std::string
*lockCookie
,
1427 bool isFileSizeAbsolute
)
1429 // build atomic write operation
1430 librados::ObjectWriteOperation writeOp
;
1431 writeOp
.create(true);
1433 std::ostringstream oss_object_size
;
1434 oss_object_size
<< m_layout
.fl_object_size
;
1435 bufferlist bl_object_size
;
1436 bl_object_size
.append(oss_object_size
.str());
1437 writeOp
.setxattr(XATTR_LAYOUT_OBJECT_SIZE
, bl_object_size
);
1439 std::ostringstream oss_stripe_unit
;
1440 oss_stripe_unit
<< m_layout
.fl_stripe_unit
;
1441 bufferlist bl_stripe_unit
;
1442 bl_stripe_unit
.append(oss_stripe_unit
.str());
1443 writeOp
.setxattr(XATTR_LAYOUT_STRIPE_UNIT
, bl_stripe_unit
);
1445 std::ostringstream oss_stripe_count
;
1446 oss_stripe_count
<< m_layout
.fl_stripe_count
;
1447 bufferlist bl_stripe_count
;
1448 bl_stripe_count
.append(oss_stripe_count
.str());
1449 writeOp
.setxattr(XATTR_LAYOUT_STRIPE_COUNT
, bl_stripe_count
);
1451 std::ostringstream oss_size
;
1452 oss_size
<< (isFileSizeAbsolute
?size
:0);
1454 bl_size
.append(oss_size
.str());
1455 writeOp
.setxattr(XATTR_SIZE
, bl_size
);
1456 // effectively change attributes
1457 std::string firstObjOid
= getObjectId(soid
, 0);
1458 int rc
= m_ioCtx
.operate(firstObjOid
, &writeOp
);
1459 // in case of error (but no EEXIST which would mean the object existed), return
1460 if (rc
&& -EEXIST
!= rc
) return rc
;
1461 // Otherwise open the object
1462 uint64_t fileSize
= size
;
1463 return openStripedObjectForWrite(soid
, layout
, &fileSize
, lockCookie
, isFileSizeAbsolute
);
1466 static void striper_truncate_aio_req_complete(rados_striper_multi_completion_t c
, void *arg
)
1468 auto cdata
= ceph::ref_t
<TruncateCompletionData
>(static_cast<TruncateCompletionData
*>(arg
), false);
1469 libradosstriper::MultiAioCompletionImpl
*comp
=
1470 reinterpret_cast<libradosstriper::MultiAioCompletionImpl
*>(c
);
1471 if (0 == comp
->rval
) {
1472 // all went fine, change size in the external attributes
1473 std::ostringstream oss
;
1474 oss
<< cdata
->m_size
;
1476 bl
.append(oss
.str());
1477 cdata
->m_striper
->setxattr(cdata
->m_soid
, XATTR_SIZE
, bl
);
1481 int libradosstriper::RadosStriperImpl::truncate(const std::string
& soid
,
1482 uint64_t original_size
,
1484 ceph_file_layout
&layout
)
1486 auto cdata
= ceph::make_ref
<TruncateCompletionData
>(this, soid
, size
);
1487 libradosstriper::MultiAioCompletionImplPtr multi_completion
{
1488 new libradosstriper::MultiAioCompletionImpl
, false};
1489 multi_completion
->set_complete_callback(cdata
->get() /* create ref! */, striper_truncate_aio_req_complete
);
1490 // call asynchrous version of truncate
1491 int rc
= aio_truncate(soid
, multi_completion
, original_size
, size
, layout
);
1492 // wait for completion of the truncation
1493 multi_completion
->finish_adding_requests();
1494 multi_completion
->wait_for_complete_and_cb();
1497 rc
= multi_completion
->get_return_value();
1502 int libradosstriper::RadosStriperImpl::aio_truncate
1503 (const std::string
& soid
,
1504 libradosstriper::MultiAioCompletionImplPtr multi_completion
,
1505 uint64_t original_size
,
1507 ceph_file_layout
&layout
)
1509 // handle the underlying rados objects. 3 cases here :
1510 // -- the objects belonging to object sets entirely located
1511 // before the truncation are unchanged
1512 // -- the objects belonging to the object set where the
1513 // truncation took place are truncated or removed
1514 // -- the objects belonging to object sets entirely located
1515 // after the truncation are removed
1516 // Note that we do it backward and that we change the size in
1517 // the external attributes only at the end. This make sure that
1518 // no rados object stays behind if we remove the striped object
1519 // after a truncation has failed
1520 uint64_t trunc_objectsetno
= size
/ layout
.fl_object_size
/ layout
.fl_stripe_count
;
1521 uint64_t last_objectsetno
= original_size
/ layout
.fl_object_size
/ layout
.fl_stripe_count
;
1522 bool exists
= false;
1523 for (int64_t objectno
= (last_objectsetno
+1) * layout
.fl_stripe_count
-1;
1524 objectno
>= (int64_t)((trunc_objectsetno
+ 1) * layout
.fl_stripe_count
);
1526 // if no object existed so far, check object existence
1528 uint64_t nb_full_object_set
= objectno
/ layout
.fl_stripe_count
;
1529 uint64_t object_index_in_set
= objectno
% layout
.fl_stripe_count
;
1530 uint64_t set_start_off
= nb_full_object_set
* layout
.fl_object_size
* layout
.fl_stripe_count
;
1531 uint64_t object_start_off
= set_start_off
+ object_index_in_set
* layout
.fl_stripe_unit
;
1532 exists
= (original_size
> object_start_off
);
1535 // remove asynchronously
1536 multi_completion
->add_request();
1537 auto data
= ceph::make_ref
<RadosRemoveCompletionData
>(multi_completion
, cct());
1538 librados::AioCompletion
*rados_completion
=
1539 librados::Rados::aio_create_completion(data
->get() /* create ref! */,
1540 rados_req_remove_complete
);
1541 int rc
= m_ioCtx
.aio_remove(getObjectId(soid
, objectno
), rados_completion
);
1542 rados_completion
->release();
1543 // in case the object did not exist, it means we had a sparse file, all is fine
1544 if (rc
&& rc
!= -ENOENT
) return rc
;
1547 for (int64_t objectno
= ((trunc_objectsetno
+ 1) * layout
.fl_stripe_count
) -1;
1548 objectno
>= (int64_t)(trunc_objectsetno
* layout
.fl_stripe_count
);
1550 // if no object existed so far, check object existence
1552 uint64_t object_start_off
= ((objectno
/ layout
.fl_stripe_count
) * layout
.fl_object_size
) +
1553 ((objectno
% layout
.fl_stripe_count
) * layout
.fl_stripe_unit
);
1554 exists
= (original_size
> object_start_off
);
1559 l
.from_legacy(layout
);
1560 uint64_t new_object_size
= Striper::object_truncate_size(cct(), &l
, objectno
, size
);
1562 if (new_object_size
> 0 or 0 == objectno
) {
1563 // trunc is synchronous as there is no async version
1564 // but note that only a single object will be truncated
1565 // reducing the overload to a fixed amount
1566 rc
= m_ioCtx
.trunc(getObjectId(soid
, objectno
), new_object_size
);
1568 // removes are asynchronous in order to speed up truncations of big files
1569 multi_completion
->add_request();
1570 auto data
= ceph::make_ref
<RadosRemoveCompletionData
>(multi_completion
, cct());
1571 librados::AioCompletion
*rados_completion
=
1572 librados::Rados::aio_create_completion(data
->get() /* create ref! */,
1573 rados_req_remove_complete
);
1574 rc
= m_ioCtx
.aio_remove(getObjectId(soid
, objectno
), rados_completion
);
1575 rados_completion
->release();
1577 // in case the object did not exist, it means we had a sparse file, all is fine
1578 if (rc
&& rc
!= -ENOENT
) return rc
;
1584 int libradosstriper::RadosStriperImpl::grow(const std::string
& soid
,
1585 uint64_t original_size
,
1587 ceph_file_layout
&layout
)
1589 // handle the underlying rados objects. As we support sparse objects,
1590 // we only have to change the size in the external attributes
1591 std::ostringstream oss
;
1594 bl
.append(oss
.str());
1595 int rc
= m_ioCtx
.setxattr(getObjectId(soid
, 0), XATTR_SIZE
, bl
);
// Returns a freshly generated random UUID as a string; within this file it
// is used for lock cookies.
// NOTE(review): the declarations of `uuid` and `suuid`, and the statement
// filling `suuid`, are not visible in this view.
1599 std::string
libradosstriper::RadosStriperImpl::getUUID()
1602 uuid
.generate_random();
1605 return std::string(suuid
);