]> git.proxmox.com Git - ceph.git/blame - ceph/src/libradosstriper/RadosStriperImpl.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / libradosstriper / RadosStriperImpl.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
c07f9fc5
FG
15#include <boost/algorithm/string/replace.hpp>
16
7c673cae
FG
17#include "libradosstriper/RadosStriperImpl.h"
18
19#include <errno.h>
20
21#include <sstream>
22#include <iomanip>
23#include <algorithm>
24
25#include "include/types.h"
26#include "include/uuid.h"
27#include "include/ceph_fs.h"
28#include "common/dout.h"
29#include "common/strtol.h"
9f95a23c 30#include "common/RefCountedObj.h"
7c673cae 31#include "osdc/Striper.h"
7c673cae
FG
32#include "librados/AioCompletionImpl.h"
33#include <cls/lock/cls_lock_client.h>
34
35/*
36 * This file contains the actual implementation of the rados striped objects interface.
37 *
38 * Striped objects are stored in rados in a set of regular rados objects, after their
39 * content has been striped using the osdc/Striper interface.
40 *
41 * The external attributes of the striped object are mapped to the attributes of the
42 * first underlying object. This first object has a set of extra external attributes
43 * storing the layout of the striped object for future read back. These attributes are :
44 * - striper.layout.object_size : the size of rados objects used.
45 * Must be a multiple of striper.layout.stripe_unit
46 * - striper.layout.stripe_unit : the size of a stripe unit
47 * - striper.layout.stripe_count : the number of stripes used
48 * - striper.size : total striped object size
49 *
50 * In general operations on striped objects are not atomic.
51 * However, a certain number of safety guards have been put to make the interface closer
52 * to atomicity :
53 * - each data operation takes a shared lock on the first rados object for the
54 * whole time of the operation
55 * - the remove and trunc operations take an exclusive lock on the first rados object
56 * for the whole time of the operation
57 * This makes sure that no removal/truncation of a striped object occurs while
58 * data operations are happening and vice versa. It thus makes sure that the layout
59 * of a striped object does not change during data operation, which is essential for
60 * data consistency.
61 *
62 * Still the writing to a striped object is not atomic. This means in particular that
63 * the size of an object may not be in sync with its content at all times.
11fdf7f2 64 * As the size is always guaranteed to be updated first and in an atomic way, and as
7c673cae
FG
65 * sparse striped objects are supported (see below), what will typically happen is
66 * that a reader that comes too soon after a write will read 0s instead of the actual
67 * data.
68 *
69 * Note that remove handles the pieces of the striped object in reverse order,
70 * so that the head object is removed last, making the completion of the deletion atomic.
71 *
72 * Striped objects can be sparse, typically in case data was written at the end of the
73 * striped object only. In such a case, some rados objects constituting the striped object
74 * may be missing. Others can be partial (only the beginning will have data).
75 * When dealing with such sparse striped files, missing objects are detected and
76 * considered as full of 0s. They are however not created until real data is written
77 * to them.
78 *
79 * There are a number of missing features/improvements that could be implemented.
80 * Here are some ideas :
81 * - implementation of missing entry points (compared to rados)
82 * In particular : clone_range, sparse_read, exec, aio_flush_async, tmaps, omaps, ...
83 *
84 */
85
86#define dout_subsys ceph_subsys_rados
87#undef dout_prefix
88#define dout_prefix *_dout << "libradosstriper: "
89
90/// size of xattr buffer
91#define XATTR_BUFFER_SIZE 32
92
93/// names of the different xattr entries
94#define XATTR_LAYOUT_STRIPE_UNIT "striper.layout.stripe_unit"
95#define XATTR_LAYOUT_STRIPE_COUNT "striper.layout.stripe_count"
96#define XATTR_LAYOUT_OBJECT_SIZE "striper.layout.object_size"
97#define XATTR_SIZE "striper.size"
98#define LOCK_PREFIX "lock."
99
100/// name of the lock used on objects to ensure layout stability during IO
101#define RADOS_LOCK_NAME "striper.lock"
102
103/// format of the extension of rados objects created for a given striped object
104#define RADOS_OBJECT_EXTENSION_FORMAT ".%016llx"
105
106/// default object layout
20effc67
TL
107static const struct ceph_file_layout default_file_layout = {
108 ceph_le32(1<<22), // fl_stripe_unit
109 ceph_le32(1), // fl_stripe_count
110 ceph_le32(1<<22), // fl_object_size
111 ceph_le32(0), // fl_cas_hash
112 ceph_le32(0), // fl_object_stripe_unit
113 ceph_le32(-1), // fl_unused
114 ceph_le32(-1), // fl_pg_pool
7c673cae
FG
115};
116
20effc67
TL
117using std::map;
118using std::pair;
119using std::string;
120using std::vector;
224ce89b
WB
121using libradosstriper::MultiAioCompletionImplPtr;
122
123namespace {
7c673cae
FG
124
125///////////////////////// CompletionData /////////////////////////////
126
224ce89b
WB
127/**
128 * struct handling the data needed to pass to the call back
129 * function in asynchronous operations
130 */
131struct CompletionData : RefCountedObject {
224ce89b
WB
132 /// complete method
133 void complete(int r);
134 /// striper to be used to handle the write completion
135 libradosstriper::RadosStriperImpl *m_striper;
136 /// striped object concerned by the write operation
137 std::string m_soid;
138 /// shared lock to be released at completion
139 std::string m_lockCookie;
140 /// completion handler
141 librados::IoCtxImpl::C_aio_Complete *m_ack;
9f95a23c
TL
142protected:
143 CompletionData(libradosstriper::RadosStriperImpl * striper,
144 const std::string& soid,
145 const std::string& lockCookie,
146 librados::AioCompletionImpl *userCompletion = 0);
147 ~CompletionData() override;
148
224ce89b
WB
149};
150
151CompletionData::CompletionData
7c673cae
FG
152(libradosstriper::RadosStriperImpl* striper,
153 const std::string& soid,
154 const std::string& lockCookie,
9f95a23c
TL
155 librados::AioCompletionImpl *userCompletion) :
156 RefCountedObject(striper->cct()),
7c673cae
FG
157 m_striper(striper), m_soid(soid), m_lockCookie(lockCookie), m_ack(0) {
158 m_striper->get();
159 if (userCompletion) {
160 m_ack = new librados::IoCtxImpl::C_aio_Complete(userCompletion);
161 userCompletion->io = striper->m_ioCtxImpl;
162 }
163}
164
224ce89b 165CompletionData::~CompletionData() {
7c673cae
FG
166 if (m_ack) delete m_ack;
167 m_striper->put();
168}
169
224ce89b 170void CompletionData::complete(int r) {
7c673cae
FG
171 if (m_ack) m_ack->finish(r);
172}
173
224ce89b
WB
174/**
175 * struct handling the data needed to pass to the call back
176 * function in asynchronous read operations
177 */
178struct ReadCompletionData : CompletionData {
179 /// bufferlist containing final result
180 bufferlist* m_bl;
181 /// extents that will be read
182 std::vector<ObjectExtent>* m_extents;
183 /// intermediate results
184 std::vector<bufferlist>* m_resultbl;
185 /// return code of read completion, to be remembered until unlocking happened
186 int m_readRc;
187 /// completion object for the unlocking of the striped object at the end of the read
188 librados::AioCompletion *m_unlockCompletion;
9f95a23c
TL
189 /// complete method for when reading is over
190 void complete_read(int r);
191 /// complete method for when object is unlocked
192 void complete_unlock(int r);
193
194private:
195 FRIEND_MAKE_REF(ReadCompletionData);
224ce89b
WB
196 ReadCompletionData(libradosstriper::RadosStriperImpl * striper,
197 const std::string& soid,
198 const std::string& lockCookie,
199 librados::AioCompletionImpl *userCompletion,
200 bufferlist* bl,
201 std::vector<ObjectExtent>* extents,
9f95a23c 202 std::vector<bufferlist>* resultbl);
224ce89b 203 ~ReadCompletionData() override;
224ce89b
WB
204};
205
206ReadCompletionData::ReadCompletionData
7c673cae
FG
207(libradosstriper::RadosStriperImpl* striper,
208 const std::string& soid,
209 const std::string& lockCookie,
210 librados::AioCompletionImpl *userCompletion,
211 bufferlist* bl,
212 std::vector<ObjectExtent>* extents,
9f95a23c
TL
213 std::vector<bufferlist>* resultbl) :
214 CompletionData(striper, soid, lockCookie, userCompletion),
7c673cae
FG
215 m_bl(bl), m_extents(extents), m_resultbl(resultbl), m_readRc(0),
216 m_unlockCompletion(0) {}
217
224ce89b 218ReadCompletionData::~ReadCompletionData() {
7c673cae
FG
219 m_unlockCompletion->release();
220 delete m_extents;
221 delete m_resultbl;
222}
223
224ce89b 224void ReadCompletionData::complete_read(int r) {
7c673cae
FG
225 // gather data into final buffer
226 Striper::StripedReadResult readResult;
227 vector<bufferlist>::iterator bit = m_resultbl->begin();
228 for (vector<ObjectExtent>::iterator eit = m_extents->begin();
229 eit != m_extents->end();
230 ++eit, ++bit) {
231 readResult.add_partial_result(m_striper->cct(), *bit, eit->buffer_extents);
232 }
233 m_bl->clear();
234 readResult.assemble_result(m_striper->cct(), *m_bl, true);
235 // Remember return code
236 m_readRc = r;
237}
238
224ce89b 239void ReadCompletionData::complete_unlock(int r) {
7c673cae
FG
240 // call parent's completion method
241 // Note that we ignore the return code of the unlock as we cannot do much about it
242 CompletionData::complete(m_readRc?m_readRc:m_bl->length());
243}
244
224ce89b
WB
245/**
246 * struct handling the data needed to pass to the call back
247 * function in asynchronous write operations
248 */
249struct WriteCompletionData : CompletionData {
250 /// safe completion handler
251 librados::IoCtxImpl::C_aio_Complete *m_safe;
224ce89b
WB
252 /// completion object for the unlocking of the striped object at the end of the write
253 librados::AioCompletion *m_unlockCompletion;
11fdf7f2
TL
254 /// return code of write completion, to be remembered until unlocking happened
255 int m_writeRc;
224ce89b
WB
256 /// complete method for when writing is over
257 void complete_write(int r);
258 /// complete method for when object is unlocked
259 void complete_unlock(int r);
260 /// safe method
261 void safe(int r);
9f95a23c
TL
262private:
263 FRIEND_MAKE_REF(WriteCompletionData);
264 /// constructor
265 WriteCompletionData(libradosstriper::RadosStriperImpl * striper,
266 const std::string& soid,
267 const std::string& lockCookie,
268 librados::AioCompletionImpl *userCompletion);
269 /// destructor
270 ~WriteCompletionData() override;
224ce89b
WB
271};
272
273WriteCompletionData::WriteCompletionData
7c673cae
FG
274(libradosstriper::RadosStriperImpl* striper,
275 const std::string& soid,
276 const std::string& lockCookie,
9f95a23c
TL
277 librados::AioCompletionImpl *userCompletion) :
278 CompletionData(striper, soid, lockCookie, userCompletion),
279 m_safe(0), m_unlockCompletion(0), m_writeRc(0) {
7c673cae
FG
280 if (userCompletion) {
281 m_safe = new librados::IoCtxImpl::C_aio_Complete(userCompletion);
282 }
283}
284
224ce89b 285WriteCompletionData::~WriteCompletionData() {
7c673cae
FG
286 m_unlockCompletion->release();
287 if (m_safe) delete m_safe;
288}
289
224ce89b 290void WriteCompletionData::complete_unlock(int r) {
7c673cae
FG
291 // call parent's completion method
292 // Note that we ignore the return code of the unlock as we cannot do much about it
293 CompletionData::complete(m_writeRc);
294}
295
224ce89b 296void WriteCompletionData::complete_write(int r) {
7c673cae
FG
297 // Remember return code
298 m_writeRc = r;
299}
300
224ce89b 301void WriteCompletionData::safe(int r) {
7c673cae
FG
302 if (m_safe) m_safe->finish(r);
303}
304
224ce89b
WB
305struct RemoveCompletionData : CompletionData {
306 /// removal flags
307 int flags;
9f95a23c
TL
308
309private:
310 FRIEND_MAKE_REF(RemoveCompletionData);
224ce89b
WB
311 /**
312 * constructor
313 * note that the constructed object will take ownership of the lock
314 */
315 RemoveCompletionData(libradosstriper::RadosStriperImpl * striper,
316 const std::string& soid,
317 const std::string& lockCookie,
318 librados::AioCompletionImpl *userCompletion,
319 int flags = 0) :
7c673cae 320 CompletionData(striper, soid, lockCookie, userCompletion), flags(flags) {}
224ce89b 321};
7c673cae 322
224ce89b
WB
323/**
324 * struct handling the data needed to pass to the call back
325 * function in asynchronous truncate operations
326 */
327struct TruncateCompletionData : RefCountedObject {
9f95a23c
TL
328 /// striper to be used
329 libradosstriper::RadosStriperImpl *m_striper;
330 /// striped object concerned by the truncate operation
331 std::string m_soid;
332 /// the final size of the truncated object
333 uint64_t m_size;
334
335private:
336 FRIEND_MAKE_REF(TruncateCompletionData);
224ce89b
WB
337 /// constructor
338 TruncateCompletionData(libradosstriper::RadosStriperImpl* striper,
339 const std::string& soid,
340 uint64_t size) :
341 RefCountedObject(striper->cct()),
342 m_striper(striper), m_soid(soid), m_size(size) {
343 m_striper->get();
344 }
345 /// destructor
346 ~TruncateCompletionData() override {
347 m_striper->put();
348 }
224ce89b 349};
7c673cae 350
224ce89b
WB
351/**
352 * struct handling the data needed to pass to the call back
353 * function in asynchronous read operations of a Rados File
354 */
355struct RadosReadCompletionData : RefCountedObject {
224ce89b
WB
356 /// the multi asynch io completion object to be used
357 MultiAioCompletionImplPtr m_multiAioCompl;
358 /// the expected number of bytes
359 uint64_t m_expectedBytes;
360 /// the bufferlist object where data have been written
361 bufferlist *m_bl;
9f95a23c
TL
362
363private:
364 FRIEND_MAKE_REF(RadosReadCompletionData);
365 /// constructor
366 RadosReadCompletionData(MultiAioCompletionImplPtr multiAioCompl,
367 uint64_t expectedBytes,
368 bufferlist *bl,
369 CephContext *context) :
370 RefCountedObject(context),
371 m_multiAioCompl(multiAioCompl), m_expectedBytes(expectedBytes), m_bl(bl) {}
224ce89b
WB
372};
373
374/**
375 * struct handling (most of) the data needed to pass to the call back
376 * function in asynchronous stat operations.
377 * Inherited by the actual type for adding time information in different
378 * versions (time_t or struct timespec)
379 */
380struct BasicStatCompletionData : CompletionData {
224ce89b
WB
381 // MultiAioCompletionImpl used to handle the double aysnc
382 // call in the back (stat + getxattr)
383 libradosstriper::MultiAioCompletionImpl *m_multiCompletion;
384 // where to store the size of first objct
385 // this will be ignored but we need a place to store it when
386 // async stat is called
387 uint64_t m_objectSize;
388 // where to store the file size
389 uint64_t *m_psize;
390 /// the bufferlist object used for the getxattr call
391 bufferlist m_bl;
392 /// return code of the stat
393 int m_statRC;
394 /// return code of the getxattr
395 int m_getxattrRC;
9f95a23c
TL
396
397protected:
398 /// constructor
399 BasicStatCompletionData(libradosstriper::RadosStriperImpl* striper,
400 const std::string& soid,
401 librados::AioCompletionImpl *userCompletion,
402 libradosstriper::MultiAioCompletionImpl *multiCompletion,
403 uint64_t *psize) :
404 CompletionData(striper, soid, "", userCompletion),
405 m_multiCompletion(multiCompletion), m_psize(psize),
406 m_statRC(0), m_getxattrRC(0) {};
407
224ce89b
WB
408};
409
410/**
411 * struct handling the data needed to pass to the call back
412 * function in asynchronous stat operations.
413 * Simple templated extension of BasicStatCompletionData.
414 * The template parameter is the type of the time information
415 * (used with time_t for stat and struct timespec for stat2)
416 */
417template<class TimeType>
418struct StatCompletionData : BasicStatCompletionData {
9f95a23c
TL
419 // where to store the file time
420 TimeType *m_pmtime;
421private:
422 FRIEND_MAKE_REF(StatCompletionData);
224ce89b 423 /// constructor
9f95a23c 424 StatCompletionData<TimeType>(libradosstriper::RadosStriperImpl* striper,
224ce89b
WB
425 const std::string& soid,
426 librados::AioCompletionImpl *userCompletion,
427 libradosstriper::MultiAioCompletionImpl *multiCompletion,
428 uint64_t *psize,
9f95a23c
TL
429 TimeType *pmtime) :
430 BasicStatCompletionData(striper, soid, userCompletion, multiCompletion, psize),
224ce89b 431 m_pmtime(pmtime) {};
224ce89b
WB
432};
433
434/**
435 * struct handling the data needed to pass to the call back
436 * function in asynchronous remove operations of a Rados File
437 */
438struct RadosRemoveCompletionData : RefCountedObject {
9f95a23c
TL
439 /// the multi asynch io completion object to be used
440 MultiAioCompletionImplPtr m_multiAioCompl;
441private:
442 FRIEND_MAKE_REF(RadosRemoveCompletionData);
224ce89b
WB
443 /// constructor
444 RadosRemoveCompletionData(MultiAioCompletionImplPtr multiAioCompl,
445 CephContext *context) :
9f95a23c 446 RefCountedObject(context),
224ce89b 447 m_multiAioCompl(multiAioCompl) {};
224ce89b
WB
448};
449
450
451} // namespace {
7c673cae
FG
452
453///////////////////////// constructor /////////////////////////////
454
455libradosstriper::RadosStriperImpl::RadosStriperImpl(librados::IoCtx& ioctx, librados::IoCtxImpl *ioctx_impl) :
9f95a23c 456 m_refCnt(0), m_radosCluster(ioctx), m_ioCtx(ioctx), m_ioCtxImpl(ioctx_impl),
7c673cae
FG
457 m_layout(default_file_layout) {}
458
459///////////////////////// layout /////////////////////////////
460
461int libradosstriper::RadosStriperImpl::setObjectLayoutStripeUnit
462(unsigned int stripe_unit)
463{
464 /* stripe unit must be non-zero, 64k increment */
465 if (!stripe_unit || (stripe_unit & (CEPH_MIN_STRIPE_UNIT-1)))
466 return -EINVAL;
467 m_layout.fl_stripe_unit = stripe_unit;
468 return 0;
469}
470
471int libradosstriper::RadosStriperImpl::setObjectLayoutStripeCount
472(unsigned int stripe_count)
473{
474 /* stripe count must be non-zero */
475 if (!stripe_count)
476 return -EINVAL;
477 m_layout.fl_stripe_count = stripe_count;
478 return 0;
479}
480
481int libradosstriper::RadosStriperImpl::setObjectLayoutObjectSize
482(unsigned int object_size)
483{
484 /* object size must be non-zero, 64k increment */
485 if (!object_size || (object_size & (CEPH_MIN_STRIPE_UNIT-1)))
486 return -EINVAL;
487 /* object size must be a multiple of stripe unit */
488 if (object_size < m_layout.fl_stripe_unit ||
489 object_size % m_layout.fl_stripe_unit)
490 return -EINVAL;
491 m_layout.fl_object_size = object_size;
492 return 0;
493}
494
495///////////////////////// xattrs /////////////////////////////
496
497int libradosstriper::RadosStriperImpl::getxattr(const object_t& soid,
498 const char *name,
499 bufferlist& bl)
500{
501 std::string firstObjOid = getObjectId(soid, 0);
502 return m_ioCtx.getxattr(firstObjOid, name, bl);
503}
504
505int libradosstriper::RadosStriperImpl::setxattr(const object_t& soid,
506 const char *name,
507 bufferlist& bl)
508{
509 std::string firstObjOid = getObjectId(soid, 0);
510 return m_ioCtx.setxattr(firstObjOid, name, bl);
511}
512
513int libradosstriper::RadosStriperImpl::getxattrs(const object_t& soid,
514 map<string, bufferlist>& attrset)
515{
516 std::string firstObjOid = getObjectId(soid, 0);
517 int rc = m_ioCtx.getxattrs(firstObjOid, attrset);
518 if (rc) return rc;
519 // cleanup internal attributes dedicated to striping and locking
520 attrset.erase(XATTR_LAYOUT_STRIPE_UNIT);
521 attrset.erase(XATTR_LAYOUT_STRIPE_COUNT);
522 attrset.erase(XATTR_LAYOUT_OBJECT_SIZE);
523 attrset.erase(XATTR_SIZE);
524 attrset.erase(std::string(LOCK_PREFIX) + RADOS_LOCK_NAME);
525 return rc;
526}
527
528int libradosstriper::RadosStriperImpl::rmxattr(const object_t& soid,
529 const char *name)
530{
531 std::string firstObjOid = getObjectId(soid, 0);
532 return m_ioCtx.rmxattr(firstObjOid, name);
533}
534
535///////////////////////// io /////////////////////////////
536
537int libradosstriper::RadosStriperImpl::write(const std::string& soid,
538 const bufferlist& bl,
539 size_t len,
540 uint64_t off)
541{
542 // open the object. This will create it if needed, retrieve its layout
543 // and size and take a shared lock on it
544 ceph_file_layout layout;
545 std::string lockCookie;
546 int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true);
547 if (rc) return rc;
548 return write_in_open_object(soid, layout, lockCookie, bl, len, off);
549}
550
551int libradosstriper::RadosStriperImpl::append(const std::string& soid,
552 const bufferlist& bl,
553 size_t len)
554{
555 // open the object. This will create it if needed, retrieve its layout
556 // and size and take a shared lock on it
557 ceph_file_layout layout;
558 uint64_t size = len;
559 std::string lockCookie;
560 int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false);
561 if (rc) return rc;
562 return write_in_open_object(soid, layout, lockCookie, bl, len, size);
563}
564
565int libradosstriper::RadosStriperImpl::write_full(const std::string& soid,
566 const bufferlist& bl)
567{
568 int rc = trunc(soid, 0);
569 if (rc && rc != -ENOENT) return rc; // ENOENT is obviously ok
570 return write(soid, bl, bl.length(), 0);
571}
572
573int libradosstriper::RadosStriperImpl::read(const std::string& soid,
574 bufferlist* bl,
575 size_t len,
576 uint64_t off)
577{
578 // create a completion object
579 librados::AioCompletionImpl c;
580 // call asynchronous method
581 int rc = aio_read(soid, &c, bl, len, off);
582 // and wait for completion
583 if (!rc) {
584 // wait for completion
585 c.wait_for_complete_and_cb();
586 // return result
587 rc = c.get_return_value();
588 }
589 return rc;
590}
591
592///////////////////////// asynchronous io /////////////////////////////
593
594int libradosstriper::RadosStriperImpl::aio_write(const std::string& soid,
595 librados::AioCompletionImpl *c,
596 const bufferlist& bl,
597 size_t len,
598 uint64_t off)
599{
600 ceph_file_layout layout;
601 std::string lockCookie;
602 int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true);
603 if (rc) return rc;
604 return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, off);
605}
606
607int libradosstriper::RadosStriperImpl::aio_append(const std::string& soid,
608 librados::AioCompletionImpl *c,
609 const bufferlist& bl,
610 size_t len)
611{
612 ceph_file_layout layout;
613 uint64_t size = len;
614 std::string lockCookie;
615 int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false);
616 if (rc) return rc;
617 // create a completion object
618 return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, size);
619}
620
621int libradosstriper::RadosStriperImpl::aio_write_full(const std::string& soid,
622 librados::AioCompletionImpl *c,
623 const bufferlist& bl)
624{
625 int rc = trunc(soid, 0);
626 if (rc) return rc;
627 return aio_write(soid, c, bl, bl.length(), 0);
628}
629
630static void rados_read_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg)
631{
9f95a23c 632 auto cdata = ceph::ref_t<ReadCompletionData>(static_cast<ReadCompletionData*>(arg), false);
7c673cae
FG
633 libradosstriper::MultiAioCompletionImpl *comp =
634 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
635 cdata->complete_unlock(comp->rval);
7c673cae
FG
636}
637
638static void striper_read_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
639{
9f95a23c 640 auto cdata = static_cast<ReadCompletionData*>(arg);
7c673cae
FG
641 // launch the async unlocking of the object
642 cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion);
643 // complete the read part in parallel
644 libradosstriper::MultiAioCompletionImpl *comp =
645 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
646 cdata->complete_read(comp->rval);
647}
648
7c673cae
FG
649static void rados_req_read_complete(rados_completion_t c, void *arg)
650{
9f95a23c 651 auto data = static_cast<RadosReadCompletionData*>(arg);
7c673cae
FG
652 int rc = rados_aio_get_return_value(c);
653 // We need to handle the case of sparse files here
654 if (rc == -ENOENT) {
655 // the object did not exist at all. This can happen for sparse files.
656 // we consider we've read 0 bytes and it will fall into next case
657 rc = 0;
658 }
9f95a23c 659 ssize_t nread = rc;
7c673cae
FG
660 if (rc >= 0 && (((uint64_t)rc) < data->m_expectedBytes)) {
661 // only partial data were present in the object (or the object did not
662 // even exist if we've gone through previous case).
663 // This is typical of sparse file and we need to complete with 0s.
664 unsigned int lenOfZeros = data->m_expectedBytes-rc;
20effc67 665 unsigned int existingDataToZero = std::min(data->m_bl->length()-rc, lenOfZeros);
7c673cae
FG
666 if (existingDataToZero > 0) {
667 data->m_bl->zero(rc, existingDataToZero);
668 }
669 if (lenOfZeros > existingDataToZero) {
670 ceph::bufferptr zeros(ceph::buffer::create(lenOfZeros-existingDataToZero));
671 zeros.zero();
672 data->m_bl->push_back(zeros);
673 }
9f95a23c 674 nread = data->m_expectedBytes;
7c673cae 675 }
9f95a23c
TL
676 auto multi_aio_comp = data->m_multiAioCompl;
677 multi_aio_comp->complete_request(nread);
678 multi_aio_comp->safe_request(rc);
7c673cae
FG
679}
680
681int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid,
682 librados::AioCompletionImpl *c,
683 bufferlist* bl,
684 size_t len,
685 uint64_t off)
686{
687 // open the object. This will retrieve its layout and size
688 // and take a shared lock on it
689 ceph_file_layout layout;
690 uint64_t size;
691 std::string lockCookie;
692 int rc = openStripedObjectForRead(soid, &layout, &size, &lockCookie);
693 if (rc) return rc;
694 // find out the actual number of bytes we can read
695 uint64_t read_len;
696 if (off >= size) {
697 // nothing to read ! We are done.
698 read_len = 0;
699 } else {
20effc67 700 read_len = std::min(len, (size_t)(size-off));
7c673cae
FG
701 }
702 // get list of extents to be read from
703 vector<ObjectExtent> *extents = new vector<ObjectExtent>();
704 if (read_len > 0) {
c07f9fc5
FG
705 std::string format = soid;
706 boost::replace_all(format, "%", "%%");
707 format += RADOS_OBJECT_EXTENSION_FORMAT;
7c673cae
FG
708 file_layout_t l;
709 l.from_legacy(layout);
710 Striper::file_to_extents(cct(), format.c_str(), &l, off, read_len,
711 0, *extents);
712 }
713
714 // create a completion object and transfer ownership of extents and resultbl
715 vector<bufferlist> *resultbl = new vector<bufferlist>(extents->size());
9f95a23c 716 auto cdata = ceph::make_ref<ReadCompletionData>(this, soid, lockCookie, c, bl, extents, resultbl);
7c673cae
FG
717 c->is_read = true;
718 c->io = m_ioCtxImpl;
719 // create a completion for the unlocking of the striped object at the end of the read
720 librados::AioCompletion *unlock_completion =
9f95a23c 721 librados::Rados::aio_create_completion(cdata->get() /* create ref! */, rados_read_aio_unlock_complete);
7c673cae
FG
722 cdata->m_unlockCompletion = unlock_completion;
723 // create the multiCompletion object handling the reads
224ce89b
WB
724 MultiAioCompletionImplPtr nc{new libradosstriper::MultiAioCompletionImpl,
725 false};
9f95a23c 726 nc->set_complete_callback(cdata.get(), striper_read_aio_req_complete);
7c673cae
FG
727 // go through the extents
728 int r = 0, i = 0;
729 for (vector<ObjectExtent>::iterator p = extents->begin(); p != extents->end(); ++p) {
730 // create a buffer list describing where to place data read from current extend
731 bufferlist *oid_bl = &((*resultbl)[i++]);
732 for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
733 q != p->buffer_extents.end();
734 ++q) {
735 bufferlist buffer_bl;
736 buffer_bl.substr_of(*bl, q->first, q->second);
737 oid_bl->append(buffer_bl);
738 }
739 // read all extends of a given object in one go
740 nc->add_request();
741 // we need 2 references on data as both rados_req_read_safe and rados_req_read_complete
742 // will release one
9f95a23c 743 auto data = ceph::make_ref<RadosReadCompletionData>(nc, p->length, oid_bl, cct());
7c673cae 744 librados::AioCompletion *rados_completion =
9f95a23c 745 librados::Rados::aio_create_completion(data.detach(), rados_req_read_complete);
7c673cae
FG
746 r = m_ioCtx.aio_read(p->oid.name, rados_completion, oid_bl, p->length, p->offset);
747 rados_completion->release();
748 if (r < 0)
749 break;
750 }
751 nc->finish_adding_requests();
7c673cae
FG
752 return r;
753}
754
755int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid,
756 librados::AioCompletionImpl *c,
757 char* buf,
758 size_t len,
759 uint64_t off)
760{
761 // create a buffer list and store it inside the completion object
762 c->bl.clear();
763 c->bl.push_back(buffer::create_static(len, buf));
764 // call the bufferlist version of this method
765 return aio_read(soid, c, &c->bl, len, off);
766}
767
768int libradosstriper::RadosStriperImpl::aio_flush()
769{
770 int ret;
771 // pass to the rados level
772 ret = m_ioCtx.aio_flush();
773 if (ret < 0)
774 return ret;
775 //wait all CompletionData are released
9f95a23c
TL
776 std::unique_lock l{lock};
777 cond.wait(l, [this] {return m_refCnt <= 1;});
7c673cae
FG
778 return ret;
779}
780
781///////////////////////// stat and deletion /////////////////////////////
782
783int libradosstriper::RadosStriperImpl::stat(const std::string& soid, uint64_t *psize, time_t *pmtime)
784{
785 // create a completion object
786 librados::AioCompletionImpl c;
787 // call asynchronous version of stat
788 int rc = aio_stat(soid, &c, psize, pmtime);
789 if (rc == 0) {
790 // wait for completion of the remove
791 c.wait_for_complete();
792 // get result
793 rc = c.get_return_value();
794 }
795 return rc;
796}
797
798static void striper_stat_aio_stat_complete(rados_completion_t c, void *arg) {
9f95a23c 799 auto data = ceph::ref_t<BasicStatCompletionData>(static_cast<BasicStatCompletionData*>(arg), false);
7c673cae
FG
800 int rc = rados_aio_get_return_value(c);
801 if (rc == -ENOENT) {
802 // remember this has failed
803 data->m_statRC = rc;
804 }
805 data->m_multiCompletion->complete_request(rc);
7c673cae
FG
806}
807
808static void striper_stat_aio_getxattr_complete(rados_completion_t c, void *arg) {
9f95a23c 809 auto data = ceph::ref_t<BasicStatCompletionData>(static_cast<BasicStatCompletionData*>(arg), false);
7c673cae
FG
810 int rc = rados_aio_get_return_value(c);
811 // We need to handle the case of sparse files here
812 if (rc < 0) {
813 // remember this has failed
814 data->m_getxattrRC = rc;
815 } else {
816 // this intermediate string allows to add a null terminator before calling strtol
817 std::string err;
818 std::string strsize(data->m_bl.c_str(), data->m_bl.length());
819 *data->m_psize = strict_strtoll(strsize.c_str(), 10, &err);
820 if (!err.empty()) {
821 lderr(data->m_striper->cct()) << XATTR_SIZE << " : " << err << dendl;
822 data->m_getxattrRC = -EINVAL;
823 }
824 rc = 0;
825 }
826 data->m_multiCompletion->complete_request(rc);
7c673cae
FG
827}
828
829static void striper_stat_aio_req_complete(rados_striper_multi_completion_t c,
830 void *arg) {
9f95a23c 831 auto data = ceph::ref_t<BasicStatCompletionData>(static_cast<BasicStatCompletionData*>(arg), false);
7c673cae
FG
832 if (data->m_statRC) {
833 data->complete(data->m_statRC);
834 } else {
835 if (data->m_getxattrRC < 0) {
836 data->complete(data->m_getxattrRC);
837 } else {
838 data->complete(0);
839 }
840 }
7c673cae
FG
841}
842
843template<class TimeType>
844int libradosstriper::RadosStriperImpl::aio_generic_stat
845(const std::string& soid,
846 librados::AioCompletionImpl *c,
847 uint64_t *psize,
848 TimeType *pmtime,
849 typename libradosstriper::RadosStriperImpl::StatFunction<TimeType>::Type statFunction)
850{
851 // use a MultiAioCompletion object for dealing with the fact
852 // that we'll do 2 asynchronous calls in parallel
224ce89b
WB
853 MultiAioCompletionImplPtr multi_completion{
854 new libradosstriper::MultiAioCompletionImpl, false};
7c673cae
FG
855 // Data object used for passing context to asynchronous calls
856 std::string firstObjOid = getObjectId(soid, 0);
9f95a23c
TL
857 auto cdata = ceph::make_ref<StatCompletionData<TimeType>>(this, firstObjOid, c, multi_completion.get(), psize, pmtime);
858 multi_completion->set_complete_callback(cdata->get() /* create ref! */, striper_stat_aio_req_complete);
7c673cae
FG
859 // use a regular AioCompletion for the stat async call
860 librados::AioCompletion *stat_completion =
9f95a23c 861 librados::Rados::aio_create_completion(cdata->get() /* create ref! */, striper_stat_aio_stat_complete);
7c673cae
FG
862 multi_completion->add_safe_request();
863 object_t obj(firstObjOid);
864 int rc = (m_ioCtxImpl->*statFunction)(obj, stat_completion->pc,
865 &cdata->m_objectSize, cdata->m_pmtime);
866 stat_completion->release();
867 if (rc < 0) {
868 // nothing is really started so cancel everything
9f95a23c 869 delete cdata.detach();
7c673cae
FG
870 return rc;
871 }
872 // use a regular AioCompletion for the getxattr async call
873 librados::AioCompletion *getxattr_completion =
9f95a23c 874 librados::Rados::aio_create_completion(cdata->get() /* create ref! */, striper_stat_aio_getxattr_complete);
7c673cae
FG
875 multi_completion->add_safe_request();
876 // in parallel, get the pmsize from the first object asynchronously
877 rc = m_ioCtxImpl->aio_getxattr(obj, getxattr_completion->pc,
878 XATTR_SIZE, cdata->m_bl);
879 getxattr_completion->release();
880 multi_completion->finish_adding_requests();
881 if (rc < 0) {
882 // the async stat is ongoing, so we need to go on
883 // we mark the getxattr as failed in the data object
884 cdata->m_getxattrRC = rc;
885 multi_completion->complete_request(rc);
7c673cae
FG
886 return rc;
887 }
7c673cae
FG
888 return 0;
889}
890
891int libradosstriper::RadosStriperImpl::aio_stat(const std::string& soid,
892 librados::AioCompletionImpl *c,
893 uint64_t *psize,
894 time_t *pmtime)
895{
896 return aio_generic_stat<time_t>(soid, c, psize, pmtime, &librados::IoCtxImpl::aio_stat);
897}
898
899int libradosstriper::RadosStriperImpl::stat2(const std::string& soid, uint64_t *psize, struct timespec *pts)
900{
901 // create a completion object
902 librados::AioCompletionImpl c;
903 // call asynchronous version of stat
904 int rc = aio_stat2(soid, &c, psize, pts);
905 if (rc == 0) {
906 // wait for completion of the remove
907 c.wait_for_complete_and_cb();
908 // get result
909 rc = c.get_return_value();
910 }
911 return rc;
912}
913
914int libradosstriper::RadosStriperImpl::aio_stat2(const std::string& soid,
915 librados::AioCompletionImpl *c,
916 uint64_t *psize,
917 struct timespec *pts)
918{
919 return aio_generic_stat<struct timespec>(soid, c, psize, pts, &librados::IoCtxImpl::aio_stat2);
920}
921
922static void rados_req_remove_complete(rados_completion_t c, void *arg)
923{
9f95a23c 924 auto cdata = static_cast<RadosRemoveCompletionData*>(arg);
7c673cae
FG
925 int rc = rados_aio_get_return_value(c);
926 // in case the object did not exist, it means we had a sparse file, all is fine
927 if (rc == -ENOENT) {
928 rc = 0;
929 }
930 cdata->m_multiAioCompl->complete_request(rc);
7c673cae 931 cdata->m_multiAioCompl->safe_request(rc);
7c673cae
FG
932}
933
934static void striper_remove_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
935{
9f95a23c 936 auto cdata = ceph::ref_t<RemoveCompletionData>(static_cast<RemoveCompletionData*>(arg), false);
7c673cae
FG
937 libradosstriper::MultiAioCompletionImpl *comp =
938 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
939 ldout(cdata->m_striper->cct(), 10)
940 << "RadosStriperImpl : striper_remove_aio_req_complete called for "
941 << cdata->m_soid << dendl;
942 int rc = comp->rval;
943 if (rc == 0) {
944 // All went fine, synchronously remove first object
945 rc = cdata->m_striper->m_ioCtx.remove(cdata->m_striper->getObjectId(cdata->m_soid, 0),
946 cdata->flags);
947 } else {
948 lderr(cdata->m_striper->cct())
949 << "RadosStriperImpl : deletion/truncation incomplete for " << cdata->m_soid
950 << ", as errors were encountered. The file is left present but it's content "
951 << " has been partially removed"
952 << dendl;
953 }
954 cdata->complete(rc);
7c673cae
FG
955}
956
957int libradosstriper::RadosStriperImpl::remove(const std::string& soid, int flags)
958{
959 // create a completion object
960 librados::AioCompletionImpl c;
961 // call asynchronous version of remove
962 int rc = aio_remove(soid, &c, flags);
963 if (rc == 0) {
964 // wait for completion of the remove
965 c.wait_for_complete_and_cb();
966 // get result
967 rc = c.get_return_value();
968 }
969 return rc;
970}
971
972int libradosstriper::RadosStriperImpl::aio_remove(const std::string& soid,
973 librados::AioCompletionImpl *c,
974 int flags)
975{
976 // the RemoveCompletionData object will lock the given soid for the duration
977 // of the removal
978 std::string lockCookie = getUUID();
979 int rc = m_ioCtx.lock_exclusive(getObjectId(soid, 0), RADOS_LOCK_NAME, lockCookie, "", 0, 0);
980 if (rc) return rc;
981 // create CompletionData for the async remove call
9f95a23c 982 auto cdata = ceph::make_ref<RemoveCompletionData>(this, soid, lockCookie, c, flags);
224ce89b
WB
983 MultiAioCompletionImplPtr multi_completion{
984 new libradosstriper::MultiAioCompletionImpl, false};
9f95a23c 985 multi_completion->set_complete_callback(cdata->get() /* create ref! */, striper_remove_aio_req_complete);
7c673cae
FG
986 // call asynchronous internal version of remove
987 ldout(cct(), 10)
988 << "RadosStriperImpl : Aio_remove starting for "
989 << soid << dendl;
990 rc = internal_aio_remove(soid, multi_completion);
7c673cae
FG
991 return rc;
992}
993
224ce89b
WB
994int libradosstriper::RadosStriperImpl::internal_aio_remove(
995 const std::string& soid,
996 MultiAioCompletionImplPtr multi_completion,
7c673cae
FG
997 int flags)
998{
999 std::string firstObjOid = getObjectId(soid, 0);
1000 try {
1001 // check size and get number of rados objects to delete
1002 uint64_t nb_objects = 0;
1003 bufferlist bl2;
1004 int rc = getxattr(soid, XATTR_SIZE, bl2);
1005 if (rc < 0) {
1006 // no object size (or not able to get it)
1007 // try to find the number of object "by hand"
1008 uint64_t psize;
1009 time_t pmtime;
1010 while (!m_ioCtx.stat(getObjectId(soid, nb_objects), &psize, &pmtime)) {
1011 nb_objects++;
1012 }
1013 } else {
1014 // count total number of rados objects in the striped object
1015 std::string err;
1016 // this intermediate string allows to add a null terminator before calling strtol
1017 std::string strsize(bl2.c_str(), bl2.length());
1018 uint64_t size = strict_strtoll(strsize.c_str(), 10, &err);
1019 if (!err.empty()) {
1020 lderr(cct()) << XATTR_SIZE << " : " << err << dendl;
1021
1022 return -EINVAL;
1023 }
1024 uint64_t object_size = m_layout.fl_object_size;
1025 uint64_t su = m_layout.fl_stripe_unit;
1026 uint64_t stripe_count = m_layout.fl_stripe_count;
1027 uint64_t nb_complete_sets = size / (object_size*stripe_count);
1028 uint64_t remaining_data = size % (object_size*stripe_count);
1029 uint64_t remaining_stripe_units = (remaining_data + su -1) / su;
1030 uint64_t remaining_objects = std::min(remaining_stripe_units, stripe_count);
1031 nb_objects = nb_complete_sets * stripe_count + remaining_objects;
1032 }
1033 // delete rados objects in reverse order
1034 // Note that we do not drop the first object. This one will only be dropped
1035 // if all other removals have been successful, and this is done in the
1036 // callback of the multi_completion object
1037 int rcr = 0;
1038 for (int i = nb_objects-1; i >= 1; i--) {
1039 multi_completion->add_request();
9f95a23c 1040 auto data = ceph::make_ref<RadosRemoveCompletionData>(multi_completion, cct());
7c673cae 1041 librados::AioCompletion *rados_completion =
9f95a23c
TL
1042 librados::Rados::aio_create_completion(data->get() /* create ref! */,
1043 rados_req_remove_complete);
7c673cae
FG
1044 if (flags == 0) {
1045 rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion);
1046 } else {
1047 rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion, flags);
1048 }
1049 rados_completion->release();
1050 if (rcr < 0 and -ENOENT != rcr) {
1051 lderr(cct()) << "RadosStriperImpl::remove : deletion incomplete for " << soid
1052 << ", as " << getObjectId(soid, i) << " could not be deleted (rc=" << rc << ")"
1053 << dendl;
1054 break;
1055 }
1056 }
1057 // we are over adding requests to the multi_completion object
1058 multi_completion->finish_adding_requests();
1059 // return
1060 return rcr;
1061 } catch (ErrorCode &e) {
11fdf7f2 1062 // error caught when trying to take the exclusive lock
7c673cae
FG
1063 return e.m_code;
1064 }
1065
1066}
1067
1068int libradosstriper::RadosStriperImpl::trunc(const std::string& soid, uint64_t size)
1069{
1070 // lock the object in exclusive mode
1071 std::string firstObjOid = getObjectId(soid, 0);
1072 librados::ObjectWriteOperation op;
1073 op.assert_exists();
1074 std::string lockCookie = RadosStriperImpl::getUUID();
1075 utime_t dur = utime_t();
f67539c2 1076 rados::cls::lock::lock(&op, RADOS_LOCK_NAME, ClsLockType::EXCLUSIVE, lockCookie, "", "", dur, 0);
7c673cae
FG
1077 int rc = m_ioCtx.operate(firstObjOid, &op);
1078 if (rc) return rc;
1079 // load layout and size
1080 ceph_file_layout layout;
1081 uint64_t original_size;
1082 rc = internal_get_layout_and_size(firstObjOid, &layout, &original_size);
1083 if (!rc) {
1084 if (size < original_size) {
1085 rc = truncate(soid, original_size, size, layout);
1086 } else if (size > original_size) {
1087 rc = grow(soid, original_size, size, layout);
1088 }
1089 }
1090 // unlock object, ignore return code as we cannot do much
1091 m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie);
1092 // final return
1093 return rc;
1094}
1095
1096
1097///////////////////////// private helpers /////////////////////////////
1098
1099std::string libradosstriper::RadosStriperImpl::getObjectId(const object_t& soid,
1100 long long unsigned objectno)
1101{
1102 std::ostringstream s;
1103 s << soid << '.' << std::setfill ('0') << std::setw(16) << std::hex << objectno;
1104 return s.str();
1105}
1106
1107void libradosstriper::RadosStriperImpl::unlockObject(const std::string& soid,
1108 const std::string& lockCookie)
1109{
1110 // unlock the shared lock on the first rados object
1111 std::string firstObjOid = getObjectId(soid, 0);
1112 m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie);
1113}
1114
1115void libradosstriper::RadosStriperImpl::aio_unlockObject(const std::string& soid,
1116 const std::string& lockCookie,
1117 librados::AioCompletion *c)
1118{
1119 // unlock the shared lock on the first rados object
1120 std::string firstObjOid = getObjectId(soid, 0);
1121 m_ioCtx.aio_unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie, c);
1122}
1123
1124static void rados_write_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg)
1125{
9f95a23c 1126 auto cdata = ceph::ref_t<WriteCompletionData>(static_cast<WriteCompletionData*>(arg), false);
7c673cae
FG
1127 libradosstriper::MultiAioCompletionImpl *comp =
1128 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1129 cdata->complete_unlock(comp->rval);
7c673cae
FG
1130}
1131
1132static void striper_write_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
1133{
9f95a23c 1134 auto cdata = ceph::ref_t<WriteCompletionData>(static_cast<WriteCompletionData*>(arg), false);
7c673cae
FG
1135 // launch the async unlocking of the object
1136 cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion);
1137 // complete the write part in parallel
1138 libradosstriper::MultiAioCompletionImpl *comp =
1139 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1140 cdata->complete_write(comp->rval);
7c673cae
FG
1141}
1142
1143static void striper_write_aio_req_safe(rados_striper_multi_completion_t c, void *arg)
1144{
9f95a23c 1145 auto cdata = ceph::ref_t<WriteCompletionData>(static_cast<WriteCompletionData*>(arg), false);
7c673cae
FG
1146 libradosstriper::MultiAioCompletionImpl *comp =
1147 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1148 cdata->safe(comp->rval);
7c673cae
FG
1149}
1150
1151int libradosstriper::RadosStriperImpl::write_in_open_object(const std::string& soid,
1152 const ceph_file_layout& layout,
1153 const std::string& lockCookie,
1154 const bufferlist& bl,
1155 size_t len,
1156 uint64_t off) {
1157 // create a completion object to be passed to the callbacks of the multicompletion
1158 // we need 3 references as striper_write_aio_req_complete will release two and
1159 // striper_write_aio_req_safe will release one
9f95a23c 1160 auto cdata = ceph::make_ref<WriteCompletionData>(this, soid, lockCookie, nullptr);
7c673cae
FG
1161 // create a completion object for the unlocking of the striped object at the end of the write
1162 librados::AioCompletion *unlock_completion =
9f95a23c 1163 librados::Rados::aio_create_completion(cdata->get() /* create ref! */, rados_write_aio_unlock_complete);
7c673cae
FG
1164 cdata->m_unlockCompletion = unlock_completion;
1165 // create the multicompletion that will handle the write completion
224ce89b
WB
1166 MultiAioCompletionImplPtr c{new libradosstriper::MultiAioCompletionImpl,
1167 false};
9f95a23c
TL
1168 c->set_complete_callback(cdata->get() /* create ref! */, striper_write_aio_req_complete);
1169 c->set_safe_callback(cdata->get() /* create ref! */, striper_write_aio_req_safe);
7c673cae
FG
1170 // call the asynchronous API
1171 int rc = internal_aio_write(soid, c, bl, len, off, layout);
1172 if (!rc) {
1173 // wait for completion and safety of data
1174 c->wait_for_complete_and_cb();
1175 c->wait_for_safe_and_cb();
1176 // wait for the unlocking
1177 unlock_completion->wait_for_complete();
1178 // return result
1179 rc = c->get_return_value();
1180 }
7c673cae
FG
1181 return rc;
1182}
1183
1184int libradosstriper::RadosStriperImpl::aio_write_in_open_object(const std::string& soid,
1185 librados::AioCompletionImpl *c,
1186 const ceph_file_layout& layout,
1187 const std::string& lockCookie,
1188 const bufferlist& bl,
1189 size_t len,
1190 uint64_t off) {
1191 // create a completion object to be passed to the callbacks of the multicompletion
1192 // we need 3 references as striper_write_aio_req_complete will release two and
1193 // striper_write_aio_req_safe will release one
9f95a23c 1194 auto cdata = ceph::make_ref<WriteCompletionData>(this, soid, lockCookie, c);
7c673cae
FG
1195 m_ioCtxImpl->get();
1196 c->io = m_ioCtxImpl;
1197 // create a completion object for the unlocking of the striped object at the end of the write
1198 librados::AioCompletion *unlock_completion =
9f95a23c 1199 librados::Rados::aio_create_completion(cdata->get() /* create ref! */, rados_write_aio_unlock_complete);
7c673cae
FG
1200 cdata->m_unlockCompletion = unlock_completion;
1201 // create the multicompletion that will handle the write completion
224ce89b
WB
1202 libradosstriper::MultiAioCompletionImplPtr nc{
1203 new libradosstriper::MultiAioCompletionImpl, false};
9f95a23c
TL
1204 nc->set_complete_callback(cdata->get() /* create ref! */, striper_write_aio_req_complete);
1205 nc->set_safe_callback(cdata->get() /* create ref! */, striper_write_aio_req_safe);
7c673cae
FG
1206 // internal asynchronous API
1207 int rc = internal_aio_write(soid, nc, bl, len, off, layout);
7c673cae
FG
1208 return rc;
1209}
1210
7c673cae
FG
1211static void rados_req_write_complete(rados_completion_t c, void *arg)
1212{
9f95a23c 1213 auto comp = reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(arg);
7c673cae 1214 comp->complete_request(rados_aio_get_return_value(c));
9f95a23c 1215 comp->safe_request(rados_aio_get_return_value(c));
7c673cae
FG
1216}
1217
1218int
1219libradosstriper::RadosStriperImpl::internal_aio_write(const std::string& soid,
224ce89b 1220 libradosstriper::MultiAioCompletionImplPtr c,
7c673cae
FG
1221 const bufferlist& bl,
1222 size_t len,
1223 uint64_t off,
1224 const ceph_file_layout& layout)
1225{
1226 int r = 0;
1227 // Do not try anything if we are called with empty buffer,
1228 // file_to_extents would raise an exception
1229 if (len > 0) {
1230 // get list of extents to be written to
1231 vector<ObjectExtent> extents;
c07f9fc5
FG
1232 std::string format = soid;
1233 boost::replace_all(format, "%", "%%");
1234 format += RADOS_OBJECT_EXTENSION_FORMAT;
7c673cae
FG
1235 file_layout_t l;
1236 l.from_legacy(layout);
1237 Striper::file_to_extents(cct(), format.c_str(), &l, off, len, 0, extents);
1238 // go through the extents
1239 for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
1240 // assemble pieces of a given object into a single buffer list
1241 bufferlist oid_bl;
1242 for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
1243 q != p->buffer_extents.end();
1244 ++q) {
1245 bufferlist buffer_bl;
1246 buffer_bl.substr_of(bl, q->first, q->second);
1247 oid_bl.append(buffer_bl);
1248 }
1249 // and write the object
1250 c->add_request();
1251 librados::AioCompletion *rados_completion =
224ce89b 1252 librados::Rados::aio_create_completion(c.get(),
9f95a23c 1253 rados_req_write_complete);
224ce89b
WB
1254 r = m_ioCtx.aio_write(p->oid.name, rados_completion, oid_bl,
1255 p->length, p->offset);
7c673cae
FG
1256 rados_completion->release();
1257 if (r < 0)
1258 break;
1259 }
1260 }
1261 c->finish_adding_requests();
1262 return r;
1263}
1264
1265int libradosstriper::RadosStriperImpl::extract_uint32_attr
1266(std::map<std::string, bufferlist> &attrs,
1267 const std::string& key,
1268 ceph_le32 *value)
1269{
1270 std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key);
1271 if (attrsIt != attrs.end()) {
1272 // this intermediate string allows to add a null terminator before calling strtol
1273 std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length());
1274 std::string err;
1275 *value = strict_strtol(strvalue.c_str(), 10, &err);
1276 if (!err.empty()) {
1277 lderr(cct()) << key << " : " << err << dendl;
1278 return -EINVAL;
1279 }
1280 } else {
1281 return -ENOENT;
1282 }
1283 return 0;
1284}
1285
1286int libradosstriper::RadosStriperImpl::extract_sizet_attr
1287(std::map<std::string, bufferlist> &attrs,
1288 const std::string& key,
1289 size_t *value)
1290{
1291 std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key);
1292 if (attrsIt != attrs.end()) {
1293 // this intermediate string allows to add a null terminator before calling strtol
1294 std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length());
1295 std::string err;
1296 *value = strict_strtoll(strvalue.c_str(), 10, &err);
1297 if (!err.empty()) {
1298 lderr(cct()) << key << " : " << err << dendl;
1299 return -EINVAL;
1300 }
1301 } else {
1302 return -ENOENT;
1303 }
1304 return 0;
1305}
1306
1307int libradosstriper::RadosStriperImpl::internal_get_layout_and_size(
1308 const std::string& oid,
1309 ceph_file_layout *layout,
1310 uint64_t *size)
1311{
1312 // get external attributes of the first rados object
1313 std::map<std::string, bufferlist> attrs;
1314 int rc = m_ioCtx.getxattrs(oid, attrs);
1315 if (rc) return rc;
1316 // deal with stripe_unit
1317 rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_UNIT, &layout->fl_stripe_unit);
1318 if (rc) return rc;
1319 // deal with stripe_count
1320 rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_COUNT, &layout->fl_stripe_count);
1321 if (rc) return rc;
1322 // deal with object_size
1323 rc = extract_uint32_attr(attrs, XATTR_LAYOUT_OBJECT_SIZE, &layout->fl_object_size);
1324 if (rc) return rc;
1325 // deal with size
1326 size_t ssize;
1327 rc = extract_sizet_attr(attrs, XATTR_SIZE, &ssize);
1328 if (rc) {
1329 return rc;
1330 }
1331 *size = ssize;
1332 // make valgrind happy by setting unused fl_pg_pool
1333 layout->fl_pg_pool = 0;
1334 return 0;
1335}
1336
1337int libradosstriper::RadosStriperImpl::openStripedObjectForRead(
1338 const std::string& soid,
1339 ceph_file_layout *layout,
1340 uint64_t *size,
1341 std::string *lockCookie)
1342{
1343 // take a lock the first rados object, if it exists and gets its size
1344 // check, lock and size reading must be atomic and are thus done within a single operation
1345 librados::ObjectWriteOperation op;
1346 op.assert_exists();
1347 *lockCookie = getUUID();
1348 utime_t dur = utime_t();
f67539c2 1349 rados::cls::lock::lock(&op, RADOS_LOCK_NAME, ClsLockType::SHARED, *lockCookie, "Tag", "", dur, 0);
7c673cae
FG
1350 std::string firstObjOid = getObjectId(soid, 0);
1351 int rc = m_ioCtx.operate(firstObjOid, &op);
1352 if (rc) {
1353 // error case (including -ENOENT)
1354 return rc;
1355 }
1356 rc = internal_get_layout_and_size(firstObjOid, layout, size);
1357 if (rc) {
1358 unlockObject(soid, *lockCookie);
1359 lderr(cct()) << "RadosStriperImpl::openStripedObjectForRead : "
1360 << "could not load layout and size for "
1361 << soid << " : rc = " << rc << dendl;
1362 }
1363 return rc;
1364}
1365
1366int libradosstriper::RadosStriperImpl::openStripedObjectForWrite(const std::string& soid,
1367 ceph_file_layout *layout,
1368 uint64_t *size,
1369 std::string *lockCookie,
1370 bool isFileSizeAbsolute)
1371{
1372 // take a lock the first rados object, if it exists
1373 // check and lock must be atomic and are thus done within a single operation
1374 librados::ObjectWriteOperation op;
1375 op.assert_exists();
1376 *lockCookie = getUUID();
1377 utime_t dur = utime_t();
f67539c2 1378 rados::cls::lock::lock(&op, RADOS_LOCK_NAME, ClsLockType::SHARED, *lockCookie, "Tag", "", dur, 0);
7c673cae
FG
1379 std::string firstObjOid = getObjectId(soid, 0);
1380 int rc = m_ioCtx.operate(firstObjOid, &op);
1381 if (rc) {
1382 if (rc == -ENOENT) {
1383 // object does not exist, delegate to createEmptyStripedObject
1384 int rc = createAndOpenStripedObject(soid, layout, *size, lockCookie, isFileSizeAbsolute);
1385 // return original size
1386 *size = 0;
1387 return rc;
1388 } else {
1389 return rc;
1390 }
1391 }
1392 // all fine
1393 uint64_t curSize;
1394 rc = internal_get_layout_and_size(firstObjOid, layout, &curSize);
1395 if (rc) {
1396 unlockObject(soid, *lockCookie);
1397 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1398 << "could not load layout and size for "
1399 << soid << " : rc = " << rc << dendl;
1400 return rc;
1401 }
1402 // atomically update object size, only if smaller than current one
1403 if (!isFileSizeAbsolute)
1404 *size += curSize;
1405 librados::ObjectWriteOperation writeOp;
1406 writeOp.cmpxattr(XATTR_SIZE, LIBRADOS_CMPXATTR_OP_GT, *size);
1407 std::ostringstream oss;
1408 oss << *size;
1409 bufferlist bl;
1410 bl.append(oss.str());
1411 writeOp.setxattr(XATTR_SIZE, bl);
1412 rc = m_ioCtx.operate(firstObjOid, &writeOp);
1413 // return current size
1414 *size = curSize;
1415 // handle case where objectsize is already bigger than size
1416 if (-ECANCELED == rc)
1417 rc = 0;
1418 if (rc) {
1419 unlockObject(soid, *lockCookie);
1420 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1421 << "could not set new size for "
1422 << soid << " : rc = " << rc << dendl;
1423 }
1424 return rc;
1425}
1426
1427int libradosstriper::RadosStriperImpl::createAndOpenStripedObject(const std::string& soid,
1428 ceph_file_layout *layout,
1429 uint64_t size,
1430 std::string *lockCookie,
1431 bool isFileSizeAbsolute)
1432{
1433 // build atomic write operation
1434 librados::ObjectWriteOperation writeOp;
1435 writeOp.create(true);
1436 // object_size
1437 std::ostringstream oss_object_size;
1438 oss_object_size << m_layout.fl_object_size;
1439 bufferlist bl_object_size;
1440 bl_object_size.append(oss_object_size.str());
1441 writeOp.setxattr(XATTR_LAYOUT_OBJECT_SIZE, bl_object_size);
1442 // stripe unit
1443 std::ostringstream oss_stripe_unit;
1444 oss_stripe_unit << m_layout.fl_stripe_unit;
1445 bufferlist bl_stripe_unit;
1446 bl_stripe_unit.append(oss_stripe_unit.str());
1447 writeOp.setxattr(XATTR_LAYOUT_STRIPE_UNIT, bl_stripe_unit);
1448 // stripe count
1449 std::ostringstream oss_stripe_count;
1450 oss_stripe_count << m_layout.fl_stripe_count;
1451 bufferlist bl_stripe_count;
1452 bl_stripe_count.append(oss_stripe_count.str());
1453 writeOp.setxattr(XATTR_LAYOUT_STRIPE_COUNT, bl_stripe_count);
1454 // size
1455 std::ostringstream oss_size;
1456 oss_size << (isFileSizeAbsolute?size:0);
1457 bufferlist bl_size;
1458 bl_size.append(oss_size.str());
1459 writeOp.setxattr(XATTR_SIZE, bl_size);
1460 // effectively change attributes
1461 std::string firstObjOid = getObjectId(soid, 0);
1462 int rc = m_ioCtx.operate(firstObjOid, &writeOp);
1463 // in case of error (but no EEXIST which would mean the object existed), return
1464 if (rc && -EEXIST != rc) return rc;
1465 // Otherwise open the object
1466 uint64_t fileSize = size;
1467 return openStripedObjectForWrite(soid, layout, &fileSize, lockCookie, isFileSizeAbsolute);
1468}
1469
1470static void striper_truncate_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
1471{
9f95a23c 1472 auto cdata = ceph::ref_t<TruncateCompletionData>(static_cast<TruncateCompletionData*>(arg), false);
7c673cae
FG
1473 libradosstriper::MultiAioCompletionImpl *comp =
1474 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1475 if (0 == comp->rval) {
1476 // all went fine, change size in the external attributes
1477 std::ostringstream oss;
1478 oss << cdata->m_size;
1479 bufferlist bl;
1480 bl.append(oss.str());
1481 cdata->m_striper->setxattr(cdata->m_soid, XATTR_SIZE, bl);
1482 }
7c673cae
FG
1483}
1484
1485int libradosstriper::RadosStriperImpl::truncate(const std::string& soid,
1486 uint64_t original_size,
1487 uint64_t size,
1488 ceph_file_layout &layout)
1489{
9f95a23c 1490 auto cdata = ceph::make_ref<TruncateCompletionData>(this, soid, size);
224ce89b
WB
1491 libradosstriper::MultiAioCompletionImplPtr multi_completion{
1492 new libradosstriper::MultiAioCompletionImpl, false};
9f95a23c 1493 multi_completion->set_complete_callback(cdata->get() /* create ref! */, striper_truncate_aio_req_complete);
7c673cae
FG
1494 // call asynchrous version of truncate
1495 int rc = aio_truncate(soid, multi_completion, original_size, size, layout);
1496 // wait for completion of the truncation
1497 multi_completion->finish_adding_requests();
1498 multi_completion->wait_for_complete_and_cb();
1499 // return result
1500 if (rc == 0) {
1501 rc = multi_completion->get_return_value();
1502 }
7c673cae
FG
1503 return rc;
1504}
1505
1506int libradosstriper::RadosStriperImpl::aio_truncate
1507(const std::string& soid,
224ce89b 1508 libradosstriper::MultiAioCompletionImplPtr multi_completion,
7c673cae
FG
1509 uint64_t original_size,
1510 uint64_t size,
1511 ceph_file_layout &layout)
1512{
1513 // handle the underlying rados objects. 3 cases here :
1514 // -- the objects belonging to object sets entirely located
1515 // before the truncation are unchanged
1516 // -- the objects belonging to the object set where the
1517 // truncation took place are truncated or removed
1518 // -- the objects belonging to object sets entirely located
1519 // after the truncation are removed
1520 // Note that we do it backward and that we change the size in
1521 // the external attributes only at the end. This make sure that
1522 // no rados object stays behind if we remove the striped object
1523 // after a truncation has failed
1524 uint64_t trunc_objectsetno = size / layout.fl_object_size / layout.fl_stripe_count;
1525 uint64_t last_objectsetno = original_size / layout.fl_object_size / layout.fl_stripe_count;
1526 bool exists = false;
1527 for (int64_t objectno = (last_objectsetno+1) * layout.fl_stripe_count-1;
1528 objectno >= (int64_t)((trunc_objectsetno + 1) * layout.fl_stripe_count);
1529 objectno--) {
1530 // if no object existed so far, check object existence
1531 if (!exists) {
1532 uint64_t nb_full_object_set = objectno / layout.fl_stripe_count;
1533 uint64_t object_index_in_set = objectno % layout.fl_stripe_count;
1534 uint64_t set_start_off = nb_full_object_set * layout.fl_object_size * layout.fl_stripe_count;
1535 uint64_t object_start_off = set_start_off + object_index_in_set * layout.fl_stripe_unit;
1536 exists = (original_size > object_start_off);
1537 }
1538 if (exists) {
1539 // remove asynchronously
1540 multi_completion->add_request();
9f95a23c 1541 auto data = ceph::make_ref<RadosRemoveCompletionData>(multi_completion, cct());
7c673cae 1542 librados::AioCompletion *rados_completion =
9f95a23c
TL
1543 librados::Rados::aio_create_completion(data->get() /* create ref! */,
1544 rados_req_remove_complete);
7c673cae
FG
1545 int rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion);
1546 rados_completion->release();
1547 // in case the object did not exist, it means we had a sparse file, all is fine
1548 if (rc && rc != -ENOENT) return rc;
1549 }
1550 }
1551 for (int64_t objectno = ((trunc_objectsetno + 1) * layout.fl_stripe_count) -1;
1552 objectno >= (int64_t)(trunc_objectsetno * layout.fl_stripe_count);
1553 objectno--) {
1554 // if no object existed so far, check object existence
1555 if (!exists) {
1556 uint64_t object_start_off = ((objectno / layout.fl_stripe_count) * layout.fl_object_size) +
1557 ((objectno % layout.fl_stripe_count) * layout.fl_stripe_unit);
1558 exists = (original_size > object_start_off);
1559 }
1560 if (exists) {
1561 // truncate
1562 file_layout_t l;
1563 l.from_legacy(layout);
1564 uint64_t new_object_size = Striper::object_truncate_size(cct(), &l, objectno, size);
1565 int rc;
1566 if (new_object_size > 0 or 0 == objectno) {
1567 // trunc is synchronous as there is no async version
1568 // but note that only a single object will be truncated
1569 // reducing the overload to a fixed amount
1570 rc = m_ioCtx.trunc(getObjectId(soid, objectno), new_object_size);
1571 } else {
1572 // removes are asynchronous in order to speed up truncations of big files
1573 multi_completion->add_request();
9f95a23c 1574 auto data = ceph::make_ref<RadosRemoveCompletionData>(multi_completion, cct());
7c673cae 1575 librados::AioCompletion *rados_completion =
9f95a23c
TL
1576 librados::Rados::aio_create_completion(data->get() /* create ref! */,
1577 rados_req_remove_complete);
7c673cae
FG
1578 rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion);
1579 rados_completion->release();
1580 }
1581 // in case the object did not exist, it means we had a sparse file, all is fine
1582 if (rc && rc != -ENOENT) return rc;
1583 }
1584 }
1585 return 0;
1586}
1587
1588int libradosstriper::RadosStriperImpl::grow(const std::string& soid,
1589 uint64_t original_size,
1590 uint64_t size,
1591 ceph_file_layout &layout)
1592{
1593 // handle the underlying rados objects. As we support sparse objects,
1594 // we only have to change the size in the external attributes
1595 std::ostringstream oss;
1596 oss << size;
1597 bufferlist bl;
1598 bl.append(oss.str());
1599 int rc = m_ioCtx.setxattr(getObjectId(soid, 0), XATTR_SIZE, bl);
1600 return rc;
1601}
1602
1603std::string libradosstriper::RadosStriperImpl::getUUID()
1604{
1605 struct uuid_d uuid;
1606 uuid.generate_random();
1607 char suuid[37];
1608 uuid.print(suuid);
1609 return std::string(suuid);
1610}