]> git.proxmox.com Git - ceph.git/blame - ceph/src/libradosstriper/RadosStriperImpl.cc
buildsys: switch source download to quincy
[ceph.git] / ceph / src / libradosstriper / RadosStriperImpl.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
c07f9fc5
FG
15#include <boost/algorithm/string/replace.hpp>
16
7c673cae
FG
17#include "libradosstriper/RadosStriperImpl.h"
18
19#include <errno.h>
20
21#include <sstream>
22#include <iomanip>
23#include <algorithm>
24
25#include "include/types.h"
26#include "include/uuid.h"
27#include "include/ceph_fs.h"
28#include "common/dout.h"
29#include "common/strtol.h"
9f95a23c 30#include "common/RefCountedObj.h"
7c673cae 31#include "osdc/Striper.h"
7c673cae
FG
32#include "librados/AioCompletionImpl.h"
33#include <cls/lock/cls_lock_client.h>
34
35/*
36 * This file contains the actual implementation of the rados striped objects interface.
37 *
38 * Striped objects are stored in rados in a set of regular rados objects, after their
39 * content has been striped using the osdc/Striper interface.
40 *
41 * The external attributes of the striped object are mapped to the attributes of the
42 * first underlying object. This first object has a set of extra external attributes
43 * storing the layout of the striped object for future read back. These attributes are :
44 * - striper.layout.object_size : the size of rados objects used.
45 * Must be a multiple of striper.layout.stripe_unit
46 * - striper.layout.stripe_unit : the size of a stripe unit
47 * - striper.layout.stripe_count : the number of stripes used
48 * - striper.size : total striped object size
49 *
50 * In general operations on striped objects are not atomic.
51 * However, a certain number of safety guards have been put to make the interface closer
52 * to atomicity :
53 * - each data operation takes a shared lock on the first rados object for the
54 * whole time of the operation
55 * - the remove and trunc operations take an exclusive lock on the first rados object
56 * for the whole time of the operation
57 * This makes sure that no removal/truncation of a striped object occurs while
58 * data operations are happening and vice versa. It thus makes sure that the layout
59 * of a striped object does not change during data operation, which is essential for
60 * data consistency.
61 *
62 * Still the writing to a striped object is not atomic. This means in particular that
63 * the size of an object may not be in sync with its content at all times.
11fdf7f2 64 * As the size is always guaranteed to be updated first and in an atomic way, and as
7c673cae
FG
65 * sparse striped objects are supported (see below), what will typically happen is
66 * that a reader that comes too soon after a write will read 0s instead of the actual
67 * data.
68 *
69 * Note that remove handles the pieces of the striped object in reverse order,
70 * so that the head object is removed last, making the completion of the deletion atomic.
71 *
72 * Striped objects can be sparse, typically in case data was written at the end of the
73 * striped object only. In such a case, some rados objects constituting the striped object
74 * may be missing. Others can be partial (only the beginning will have data).
75 * When dealing with such sparse striped files, missing objects are detected and
76 * considered as full of 0s. They are however not created until real data is written
77 * to them.
78 *
79 * There are a number of missing features/improvements that could be implemented.
80 * Here are some ideas :
81 * - implementation of missing entry points (compared to rados)
82 * In particular : clone_range, sparse_read, exec, aio_flush_async, tmaps, omaps, ...
83 *
84 */
85
86#define dout_subsys ceph_subsys_rados
87#undef dout_prefix
88#define dout_prefix *_dout << "libradosstriper: "
89
90/// size of xattr buffer
91#define XATTR_BUFFER_SIZE 32
92
93/// names of the different xattr entries
94#define XATTR_LAYOUT_STRIPE_UNIT "striper.layout.stripe_unit"
95#define XATTR_LAYOUT_STRIPE_COUNT "striper.layout.stripe_count"
96#define XATTR_LAYOUT_OBJECT_SIZE "striper.layout.object_size"
97#define XATTR_SIZE "striper.size"
98#define LOCK_PREFIX "lock."
99
100/// name of the lock used on objects to ensure layout stability during IO
101#define RADOS_LOCK_NAME "striper.lock"
102
103/// format of the extension of rados objects created for a given striped object
104#define RADOS_OBJECT_EXTENSION_FORMAT ".%016llx"
105
106/// default object layout
107struct ceph_file_layout default_file_layout = {
108 init_le32(1<<22), // fl_stripe_unit
109 init_le32(1), // fl_stripe_count
110 init_le32(1<<22), // fl_object_size
111 init_le32(0), // fl_cas_hash
112 init_le32(0), // fl_object_stripe_unit
113 init_le32(-1), // fl_unused
114 init_le32(-1), // fl_pg_pool
115};
116
224ce89b
WB
117using libradosstriper::MultiAioCompletionImplPtr;
118
119namespace {
7c673cae
FG
120
121///////////////////////// CompletionData /////////////////////////////
122
224ce89b
WB
123/**
124 * struct handling the data needed to pass to the call back
125 * function in asynchronous operations
126 */
127struct CompletionData : RefCountedObject {
224ce89b
WB
128 /// complete method
129 void complete(int r);
130 /// striper to be used to handle the write completion
131 libradosstriper::RadosStriperImpl *m_striper;
132 /// striped object concerned by the write operation
133 std::string m_soid;
134 /// shared lock to be released at completion
135 std::string m_lockCookie;
136 /// completion handler
137 librados::IoCtxImpl::C_aio_Complete *m_ack;
9f95a23c
TL
138protected:
139 CompletionData(libradosstriper::RadosStriperImpl * striper,
140 const std::string& soid,
141 const std::string& lockCookie,
142 librados::AioCompletionImpl *userCompletion = 0);
143 ~CompletionData() override;
144
224ce89b
WB
145};
146
147CompletionData::CompletionData
7c673cae
FG
148(libradosstriper::RadosStriperImpl* striper,
149 const std::string& soid,
150 const std::string& lockCookie,
9f95a23c
TL
151 librados::AioCompletionImpl *userCompletion) :
152 RefCountedObject(striper->cct()),
7c673cae
FG
153 m_striper(striper), m_soid(soid), m_lockCookie(lockCookie), m_ack(0) {
154 m_striper->get();
155 if (userCompletion) {
156 m_ack = new librados::IoCtxImpl::C_aio_Complete(userCompletion);
157 userCompletion->io = striper->m_ioCtxImpl;
158 }
159}
160
224ce89b 161CompletionData::~CompletionData() {
7c673cae
FG
162 if (m_ack) delete m_ack;
163 m_striper->put();
164}
165
224ce89b 166void CompletionData::complete(int r) {
7c673cae
FG
167 if (m_ack) m_ack->finish(r);
168}
169
224ce89b
WB
170/**
171 * struct handling the data needed to pass to the call back
172 * function in asynchronous read operations
173 */
174struct ReadCompletionData : CompletionData {
175 /// bufferlist containing final result
176 bufferlist* m_bl;
177 /// extents that will be read
178 std::vector<ObjectExtent>* m_extents;
179 /// intermediate results
180 std::vector<bufferlist>* m_resultbl;
181 /// return code of read completion, to be remembered until unlocking happened
182 int m_readRc;
183 /// completion object for the unlocking of the striped object at the end of the read
184 librados::AioCompletion *m_unlockCompletion;
9f95a23c
TL
185 /// complete method for when reading is over
186 void complete_read(int r);
187 /// complete method for when object is unlocked
188 void complete_unlock(int r);
189
190private:
191 FRIEND_MAKE_REF(ReadCompletionData);
224ce89b
WB
192 ReadCompletionData(libradosstriper::RadosStriperImpl * striper,
193 const std::string& soid,
194 const std::string& lockCookie,
195 librados::AioCompletionImpl *userCompletion,
196 bufferlist* bl,
197 std::vector<ObjectExtent>* extents,
9f95a23c 198 std::vector<bufferlist>* resultbl);
224ce89b 199 ~ReadCompletionData() override;
224ce89b
WB
200};
201
202ReadCompletionData::ReadCompletionData
7c673cae
FG
203(libradosstriper::RadosStriperImpl* striper,
204 const std::string& soid,
205 const std::string& lockCookie,
206 librados::AioCompletionImpl *userCompletion,
207 bufferlist* bl,
208 std::vector<ObjectExtent>* extents,
9f95a23c
TL
209 std::vector<bufferlist>* resultbl) :
210 CompletionData(striper, soid, lockCookie, userCompletion),
7c673cae
FG
211 m_bl(bl), m_extents(extents), m_resultbl(resultbl), m_readRc(0),
212 m_unlockCompletion(0) {}
213
224ce89b 214ReadCompletionData::~ReadCompletionData() {
7c673cae
FG
215 m_unlockCompletion->release();
216 delete m_extents;
217 delete m_resultbl;
218}
219
224ce89b 220void ReadCompletionData::complete_read(int r) {
7c673cae
FG
221 // gather data into final buffer
222 Striper::StripedReadResult readResult;
223 vector<bufferlist>::iterator bit = m_resultbl->begin();
224 for (vector<ObjectExtent>::iterator eit = m_extents->begin();
225 eit != m_extents->end();
226 ++eit, ++bit) {
227 readResult.add_partial_result(m_striper->cct(), *bit, eit->buffer_extents);
228 }
229 m_bl->clear();
230 readResult.assemble_result(m_striper->cct(), *m_bl, true);
231 // Remember return code
232 m_readRc = r;
233}
234
224ce89b 235void ReadCompletionData::complete_unlock(int r) {
7c673cae
FG
236 // call parent's completion method
237 // Note that we ignore the return code of the unlock as we cannot do much about it
238 CompletionData::complete(m_readRc?m_readRc:m_bl->length());
239}
240
224ce89b
WB
241/**
242 * struct handling the data needed to pass to the call back
243 * function in asynchronous write operations
244 */
245struct WriteCompletionData : CompletionData {
246 /// safe completion handler
247 librados::IoCtxImpl::C_aio_Complete *m_safe;
224ce89b
WB
248 /// completion object for the unlocking of the striped object at the end of the write
249 librados::AioCompletion *m_unlockCompletion;
11fdf7f2
TL
250 /// return code of write completion, to be remembered until unlocking happened
251 int m_writeRc;
224ce89b
WB
252 /// complete method for when writing is over
253 void complete_write(int r);
254 /// complete method for when object is unlocked
255 void complete_unlock(int r);
256 /// safe method
257 void safe(int r);
9f95a23c
TL
258private:
259 FRIEND_MAKE_REF(WriteCompletionData);
260 /// constructor
261 WriteCompletionData(libradosstriper::RadosStriperImpl * striper,
262 const std::string& soid,
263 const std::string& lockCookie,
264 librados::AioCompletionImpl *userCompletion);
265 /// destructor
266 ~WriteCompletionData() override;
224ce89b
WB
267};
268
269WriteCompletionData::WriteCompletionData
7c673cae
FG
270(libradosstriper::RadosStriperImpl* striper,
271 const std::string& soid,
272 const std::string& lockCookie,
9f95a23c
TL
273 librados::AioCompletionImpl *userCompletion) :
274 CompletionData(striper, soid, lockCookie, userCompletion),
275 m_safe(0), m_unlockCompletion(0), m_writeRc(0) {
7c673cae
FG
276 if (userCompletion) {
277 m_safe = new librados::IoCtxImpl::C_aio_Complete(userCompletion);
278 }
279}
280
224ce89b 281WriteCompletionData::~WriteCompletionData() {
7c673cae
FG
282 m_unlockCompletion->release();
283 if (m_safe) delete m_safe;
284}
285
224ce89b 286void WriteCompletionData::complete_unlock(int r) {
7c673cae
FG
287 // call parent's completion method
288 // Note that we ignore the return code of the unlock as we cannot do much about it
289 CompletionData::complete(m_writeRc);
290}
291
224ce89b 292void WriteCompletionData::complete_write(int r) {
7c673cae
FG
293 // Remember return code
294 m_writeRc = r;
295}
296
224ce89b 297void WriteCompletionData::safe(int r) {
7c673cae
FG
298 if (m_safe) m_safe->finish(r);
299}
300
224ce89b
WB
301struct RemoveCompletionData : CompletionData {
302 /// removal flags
303 int flags;
9f95a23c
TL
304
305private:
306 FRIEND_MAKE_REF(RemoveCompletionData);
224ce89b
WB
307 /**
308 * constructor
309 * note that the constructed object will take ownership of the lock
310 */
311 RemoveCompletionData(libradosstriper::RadosStriperImpl * striper,
312 const std::string& soid,
313 const std::string& lockCookie,
314 librados::AioCompletionImpl *userCompletion,
315 int flags = 0) :
7c673cae 316 CompletionData(striper, soid, lockCookie, userCompletion), flags(flags) {}
224ce89b 317};
7c673cae 318
224ce89b
WB
319/**
320 * struct handling the data needed to pass to the call back
321 * function in asynchronous truncate operations
322 */
323struct TruncateCompletionData : RefCountedObject {
9f95a23c
TL
324 /// striper to be used
325 libradosstriper::RadosStriperImpl *m_striper;
326 /// striped object concerned by the truncate operation
327 std::string m_soid;
328 /// the final size of the truncated object
329 uint64_t m_size;
330
331private:
332 FRIEND_MAKE_REF(TruncateCompletionData);
224ce89b
WB
333 /// constructor
334 TruncateCompletionData(libradosstriper::RadosStriperImpl* striper,
335 const std::string& soid,
336 uint64_t size) :
337 RefCountedObject(striper->cct()),
338 m_striper(striper), m_soid(soid), m_size(size) {
339 m_striper->get();
340 }
341 /// destructor
342 ~TruncateCompletionData() override {
343 m_striper->put();
344 }
224ce89b 345};
7c673cae 346
224ce89b
WB
347/**
348 * struct handling the data needed to pass to the call back
349 * function in asynchronous read operations of a Rados File
350 */
351struct RadosReadCompletionData : RefCountedObject {
224ce89b
WB
352 /// the multi asynch io completion object to be used
353 MultiAioCompletionImplPtr m_multiAioCompl;
354 /// the expected number of bytes
355 uint64_t m_expectedBytes;
356 /// the bufferlist object where data have been written
357 bufferlist *m_bl;
9f95a23c
TL
358
359private:
360 FRIEND_MAKE_REF(RadosReadCompletionData);
361 /// constructor
362 RadosReadCompletionData(MultiAioCompletionImplPtr multiAioCompl,
363 uint64_t expectedBytes,
364 bufferlist *bl,
365 CephContext *context) :
366 RefCountedObject(context),
367 m_multiAioCompl(multiAioCompl), m_expectedBytes(expectedBytes), m_bl(bl) {}
224ce89b
WB
368};
369
370/**
371 * struct handling (most of) the data needed to pass to the call back
372 * function in asynchronous stat operations.
373 * Inherited by the actual type for adding time information in different
374 * versions (time_t or struct timespec)
375 */
376struct BasicStatCompletionData : CompletionData {
224ce89b
WB
377 // MultiAioCompletionImpl used to handle the double aysnc
378 // call in the back (stat + getxattr)
379 libradosstriper::MultiAioCompletionImpl *m_multiCompletion;
380 // where to store the size of first objct
381 // this will be ignored but we need a place to store it when
382 // async stat is called
383 uint64_t m_objectSize;
384 // where to store the file size
385 uint64_t *m_psize;
386 /// the bufferlist object used for the getxattr call
387 bufferlist m_bl;
388 /// return code of the stat
389 int m_statRC;
390 /// return code of the getxattr
391 int m_getxattrRC;
9f95a23c
TL
392
393protected:
394 /// constructor
395 BasicStatCompletionData(libradosstriper::RadosStriperImpl* striper,
396 const std::string& soid,
397 librados::AioCompletionImpl *userCompletion,
398 libradosstriper::MultiAioCompletionImpl *multiCompletion,
399 uint64_t *psize) :
400 CompletionData(striper, soid, "", userCompletion),
401 m_multiCompletion(multiCompletion), m_psize(psize),
402 m_statRC(0), m_getxattrRC(0) {};
403
224ce89b
WB
404};
405
406/**
407 * struct handling the data needed to pass to the call back
408 * function in asynchronous stat operations.
409 * Simple templated extension of BasicStatCompletionData.
410 * The template parameter is the type of the time information
411 * (used with time_t for stat and struct timespec for stat2)
412 */
413template<class TimeType>
414struct StatCompletionData : BasicStatCompletionData {
9f95a23c
TL
415 // where to store the file time
416 TimeType *m_pmtime;
417private:
418 FRIEND_MAKE_REF(StatCompletionData);
224ce89b 419 /// constructor
9f95a23c 420 StatCompletionData<TimeType>(libradosstriper::RadosStriperImpl* striper,
224ce89b
WB
421 const std::string& soid,
422 librados::AioCompletionImpl *userCompletion,
423 libradosstriper::MultiAioCompletionImpl *multiCompletion,
424 uint64_t *psize,
9f95a23c
TL
425 TimeType *pmtime) :
426 BasicStatCompletionData(striper, soid, userCompletion, multiCompletion, psize),
224ce89b 427 m_pmtime(pmtime) {};
224ce89b
WB
428};
429
430/**
431 * struct handling the data needed to pass to the call back
432 * function in asynchronous remove operations of a Rados File
433 */
434struct RadosRemoveCompletionData : RefCountedObject {
9f95a23c
TL
435 /// the multi asynch io completion object to be used
436 MultiAioCompletionImplPtr m_multiAioCompl;
437private:
438 FRIEND_MAKE_REF(RadosRemoveCompletionData);
224ce89b
WB
439 /// constructor
440 RadosRemoveCompletionData(MultiAioCompletionImplPtr multiAioCompl,
441 CephContext *context) :
9f95a23c 442 RefCountedObject(context),
224ce89b 443 m_multiAioCompl(multiAioCompl) {};
224ce89b
WB
444};
445
446
447} // namespace {
7c673cae
FG
448
449///////////////////////// constructor /////////////////////////////
450
451libradosstriper::RadosStriperImpl::RadosStriperImpl(librados::IoCtx& ioctx, librados::IoCtxImpl *ioctx_impl) :
9f95a23c 452 m_refCnt(0), m_radosCluster(ioctx), m_ioCtx(ioctx), m_ioCtxImpl(ioctx_impl),
7c673cae
FG
453 m_layout(default_file_layout) {}
454
455///////////////////////// layout /////////////////////////////
456
457int libradosstriper::RadosStriperImpl::setObjectLayoutStripeUnit
458(unsigned int stripe_unit)
459{
460 /* stripe unit must be non-zero, 64k increment */
461 if (!stripe_unit || (stripe_unit & (CEPH_MIN_STRIPE_UNIT-1)))
462 return -EINVAL;
463 m_layout.fl_stripe_unit = stripe_unit;
464 return 0;
465}
466
467int libradosstriper::RadosStriperImpl::setObjectLayoutStripeCount
468(unsigned int stripe_count)
469{
470 /* stripe count must be non-zero */
471 if (!stripe_count)
472 return -EINVAL;
473 m_layout.fl_stripe_count = stripe_count;
474 return 0;
475}
476
477int libradosstriper::RadosStriperImpl::setObjectLayoutObjectSize
478(unsigned int object_size)
479{
480 /* object size must be non-zero, 64k increment */
481 if (!object_size || (object_size & (CEPH_MIN_STRIPE_UNIT-1)))
482 return -EINVAL;
483 /* object size must be a multiple of stripe unit */
484 if (object_size < m_layout.fl_stripe_unit ||
485 object_size % m_layout.fl_stripe_unit)
486 return -EINVAL;
487 m_layout.fl_object_size = object_size;
488 return 0;
489}
490
491///////////////////////// xattrs /////////////////////////////
492
493int libradosstriper::RadosStriperImpl::getxattr(const object_t& soid,
494 const char *name,
495 bufferlist& bl)
496{
497 std::string firstObjOid = getObjectId(soid, 0);
498 return m_ioCtx.getxattr(firstObjOid, name, bl);
499}
500
501int libradosstriper::RadosStriperImpl::setxattr(const object_t& soid,
502 const char *name,
503 bufferlist& bl)
504{
505 std::string firstObjOid = getObjectId(soid, 0);
506 return m_ioCtx.setxattr(firstObjOid, name, bl);
507}
508
509int libradosstriper::RadosStriperImpl::getxattrs(const object_t& soid,
510 map<string, bufferlist>& attrset)
511{
512 std::string firstObjOid = getObjectId(soid, 0);
513 int rc = m_ioCtx.getxattrs(firstObjOid, attrset);
514 if (rc) return rc;
515 // cleanup internal attributes dedicated to striping and locking
516 attrset.erase(XATTR_LAYOUT_STRIPE_UNIT);
517 attrset.erase(XATTR_LAYOUT_STRIPE_COUNT);
518 attrset.erase(XATTR_LAYOUT_OBJECT_SIZE);
519 attrset.erase(XATTR_SIZE);
520 attrset.erase(std::string(LOCK_PREFIX) + RADOS_LOCK_NAME);
521 return rc;
522}
523
524int libradosstriper::RadosStriperImpl::rmxattr(const object_t& soid,
525 const char *name)
526{
527 std::string firstObjOid = getObjectId(soid, 0);
528 return m_ioCtx.rmxattr(firstObjOid, name);
529}
530
531///////////////////////// io /////////////////////////////
532
533int libradosstriper::RadosStriperImpl::write(const std::string& soid,
534 const bufferlist& bl,
535 size_t len,
536 uint64_t off)
537{
538 // open the object. This will create it if needed, retrieve its layout
539 // and size and take a shared lock on it
540 ceph_file_layout layout;
541 std::string lockCookie;
542 int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true);
543 if (rc) return rc;
544 return write_in_open_object(soid, layout, lockCookie, bl, len, off);
545}
546
547int libradosstriper::RadosStriperImpl::append(const std::string& soid,
548 const bufferlist& bl,
549 size_t len)
550{
551 // open the object. This will create it if needed, retrieve its layout
552 // and size and take a shared lock on it
553 ceph_file_layout layout;
554 uint64_t size = len;
555 std::string lockCookie;
556 int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false);
557 if (rc) return rc;
558 return write_in_open_object(soid, layout, lockCookie, bl, len, size);
559}
560
561int libradosstriper::RadosStriperImpl::write_full(const std::string& soid,
562 const bufferlist& bl)
563{
564 int rc = trunc(soid, 0);
565 if (rc && rc != -ENOENT) return rc; // ENOENT is obviously ok
566 return write(soid, bl, bl.length(), 0);
567}
568
569int libradosstriper::RadosStriperImpl::read(const std::string& soid,
570 bufferlist* bl,
571 size_t len,
572 uint64_t off)
573{
574 // create a completion object
575 librados::AioCompletionImpl c;
576 // call asynchronous method
577 int rc = aio_read(soid, &c, bl, len, off);
578 // and wait for completion
579 if (!rc) {
580 // wait for completion
581 c.wait_for_complete_and_cb();
582 // return result
583 rc = c.get_return_value();
584 }
585 return rc;
586}
587
588///////////////////////// asynchronous io /////////////////////////////
589
590int libradosstriper::RadosStriperImpl::aio_write(const std::string& soid,
591 librados::AioCompletionImpl *c,
592 const bufferlist& bl,
593 size_t len,
594 uint64_t off)
595{
596 ceph_file_layout layout;
597 std::string lockCookie;
598 int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true);
599 if (rc) return rc;
600 return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, off);
601}
602
603int libradosstriper::RadosStriperImpl::aio_append(const std::string& soid,
604 librados::AioCompletionImpl *c,
605 const bufferlist& bl,
606 size_t len)
607{
608 ceph_file_layout layout;
609 uint64_t size = len;
610 std::string lockCookie;
611 int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false);
612 if (rc) return rc;
613 // create a completion object
614 return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, size);
615}
616
617int libradosstriper::RadosStriperImpl::aio_write_full(const std::string& soid,
618 librados::AioCompletionImpl *c,
619 const bufferlist& bl)
620{
621 int rc = trunc(soid, 0);
622 if (rc) return rc;
623 return aio_write(soid, c, bl, bl.length(), 0);
624}
625
626static void rados_read_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg)
627{
9f95a23c 628 auto cdata = ceph::ref_t<ReadCompletionData>(static_cast<ReadCompletionData*>(arg), false);
7c673cae
FG
629 libradosstriper::MultiAioCompletionImpl *comp =
630 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
631 cdata->complete_unlock(comp->rval);
7c673cae
FG
632}
633
634static void striper_read_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
635{
9f95a23c 636 auto cdata = static_cast<ReadCompletionData*>(arg);
7c673cae
FG
637 // launch the async unlocking of the object
638 cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion);
639 // complete the read part in parallel
640 libradosstriper::MultiAioCompletionImpl *comp =
641 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
642 cdata->complete_read(comp->rval);
643}
644
7c673cae
FG
645static void rados_req_read_complete(rados_completion_t c, void *arg)
646{
9f95a23c 647 auto data = static_cast<RadosReadCompletionData*>(arg);
7c673cae
FG
648 int rc = rados_aio_get_return_value(c);
649 // We need to handle the case of sparse files here
650 if (rc == -ENOENT) {
651 // the object did not exist at all. This can happen for sparse files.
652 // we consider we've read 0 bytes and it will fall into next case
653 rc = 0;
654 }
9f95a23c 655 ssize_t nread = rc;
7c673cae
FG
656 if (rc >= 0 && (((uint64_t)rc) < data->m_expectedBytes)) {
657 // only partial data were present in the object (or the object did not
658 // even exist if we've gone through previous case).
659 // This is typical of sparse file and we need to complete with 0s.
660 unsigned int lenOfZeros = data->m_expectedBytes-rc;
661 unsigned int existingDataToZero = min(data->m_bl->length()-rc, lenOfZeros);
662 if (existingDataToZero > 0) {
663 data->m_bl->zero(rc, existingDataToZero);
664 }
665 if (lenOfZeros > existingDataToZero) {
666 ceph::bufferptr zeros(ceph::buffer::create(lenOfZeros-existingDataToZero));
667 zeros.zero();
668 data->m_bl->push_back(zeros);
669 }
9f95a23c 670 nread = data->m_expectedBytes;
7c673cae 671 }
9f95a23c
TL
672 auto multi_aio_comp = data->m_multiAioCompl;
673 multi_aio_comp->complete_request(nread);
674 multi_aio_comp->safe_request(rc);
7c673cae
FG
675}
676
/**
 * Starts the asynchronous read of up to len bytes at offset off of the
 * striped object soid into bl; c is the user completion handed over to the
 * ReadCompletionData created here.
 *
 * The number of bytes read is clamped to what the object holds after off
 * (0 when off is at or beyond the object size). One rados read is issued
 * per object extent, all of them being gathered by a multi completion whose
 * callback is striper_read_aio_req_complete.
 *
 * @return the non-zero return code of openStripedObjectForRead when opening
 *         failed; otherwise the return code of the last rados aio_read
 *         issued (negative when the loop stopped on a failure), or 0 when
 *         none was issued
 */
677int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid,
678 librados::AioCompletionImpl *c,
679 bufferlist* bl,
680 size_t len,
681 uint64_t off)
682{
683 // open the object. This will retrieve its layout and size
684 // and take a shared lock on it
685 ceph_file_layout layout;
686 uint64_t size;
687 std::string lockCookie;
688 int rc = openStripedObjectForRead(soid, &layout, &size, &lockCookie);
689 if (rc) return rc;
690 // find out the actual number of bytes we can read
691 uint64_t read_len;
692 if (off >= size) {
693 // nothing to read ! We are done.
694 read_len = 0;
695 } else {
696 read_len = min(len, (size_t)(size-off));
697 }
698 // get list of extents to be read from
699 vector<ObjectExtent> *extents = new vector<ObjectExtent>();
700 if (read_len > 0) {
c07f9fc5
FG
// '%' characters of the object name are doubled before the extension format
// is appended, as the resulting string is handed over as a format
701 std::string format = soid;
702 boost::replace_all(format, "%", "%%");
703 format += RADOS_OBJECT_EXTENSION_FORMAT;
7c673cae
FG
704 file_layout_t l;
705 l.from_legacy(layout);
706 Striper::file_to_extents(cct(), format.c_str(), &l, off, read_len,
707 0, *extents);
708 }
709
710 // create a completion object and transfer ownership of extents and resultbl
// resultbl gets one (initially empty) bufferlist per extent
711 vector<bufferlist> *resultbl = new vector<bufferlist>(extents->size());
9f95a23c 712 auto cdata = ceph::make_ref<ReadCompletionData>(this, soid, lockCookie, c, bl, extents, resultbl);
7c673cae
FG
713 c->is_read = true;
714 c->io = m_ioCtxImpl;
715 // create a completion for the unlocking of the striped object at the end of the read
// cdata->get() is called on the object itself (see the "create ref!" remark)
// so that the unlock callback receives a reference of its own
716 librados::AioCompletion *unlock_completion =
9f95a23c 717 librados::Rados::aio_create_completion(cdata->get() /* create ref! */, rados_read_aio_unlock_complete);
7c673cae
FG
718 cdata->m_unlockCompletion = unlock_completion;
719 // create the multiCompletion object handling the reads
224ce89b
WB
720 MultiAioCompletionImplPtr nc{new libradosstriper::MultiAioCompletionImpl,
721 false};
// NOTE(review): cdata.get() is called on the handle here, not on the object;
// presumably it only yields the raw pointer without adding a reference - confirm
9f95a23c 722 nc->set_complete_callback(cdata.get(), striper_read_aio_req_complete);
7c673cae
FG
723 // go through the extents
724 int r = 0, i = 0;
725 for (vector<ObjectExtent>::iterator p = extents->begin(); p != extents->end(); ++p) {
726 // create a buffer list describing where to place data read from current extend
727 bufferlist *oid_bl = &((*resultbl)[i++]);
728 for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
729 q != p->buffer_extents.end();
730 ++q) {
731 bufferlist buffer_bl;
732 buffer_bl.substr_of(*bl, q->first, q->second);
733 oid_bl->append(buffer_bl);
734 }
735 // read all extends of a given object in one go
736 nc->add_request();
737 // we need 2 references on data as both rados_req_read_safe and rados_req_read_complete
738 // will release one
// NOTE(review): only rados_req_read_complete is registered below, and a single
// reference is handed over through detach(); the comment above looks stale - confirm
9f95a23c 739 auto data = ceph::make_ref<RadosReadCompletionData>(nc, p->length, oid_bl, cct());
7c673cae 740 librados::AioCompletion *rados_completion =
9f95a23c 741 librados::Rados::aio_create_completion(data.detach(), rados_req_read_complete);
7c673cae
FG
742 r = m_ioCtx.aio_read(p->oid.name, rados_completion, oid_bl, p->length, p->offset);
743 rados_completion->release();
// NOTE(review): on failure the request counted by add_request() above was
// never started; verify that the multi completion still gets completed
744 if (r < 0)
745 break;
746 }
747 nc->finish_adding_requests();
7c673cae
FG
748 return r;
749}
750
751int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid,
752 librados::AioCompletionImpl *c,
753 char* buf,
754 size_t len,
755 uint64_t off)
756{
757 // create a buffer list and store it inside the completion object
758 c->bl.clear();
759 c->bl.push_back(buffer::create_static(len, buf));
760 // call the bufferlist version of this method
761 return aio_read(soid, c, &c->bl, len, off);
762}
763
764int libradosstriper::RadosStriperImpl::aio_flush()
765{
766 int ret;
767 // pass to the rados level
768 ret = m_ioCtx.aio_flush();
769 if (ret < 0)
770 return ret;
771 //wait all CompletionData are released
9f95a23c
TL
772 std::unique_lock l{lock};
773 cond.wait(l, [this] {return m_refCnt <= 1;});
7c673cae
FG
774 return ret;
775}
776
777///////////////////////// stat and deletion /////////////////////////////
778
779int libradosstriper::RadosStriperImpl::stat(const std::string& soid, uint64_t *psize, time_t *pmtime)
780{
781 // create a completion object
782 librados::AioCompletionImpl c;
783 // call asynchronous version of stat
784 int rc = aio_stat(soid, &c, psize, pmtime);
785 if (rc == 0) {
786 // wait for completion of the remove
787 c.wait_for_complete();
788 // get result
789 rc = c.get_return_value();
790 }
791 return rc;
792}
793
794static void striper_stat_aio_stat_complete(rados_completion_t c, void *arg) {
9f95a23c 795 auto data = ceph::ref_t<BasicStatCompletionData>(static_cast<BasicStatCompletionData*>(arg), false);
7c673cae
FG
796 int rc = rados_aio_get_return_value(c);
797 if (rc == -ENOENT) {
798 // remember this has failed
799 data->m_statRC = rc;
800 }
801 data->m_multiCompletion->complete_request(rc);
7c673cae
FG
802}
803
804static void striper_stat_aio_getxattr_complete(rados_completion_t c, void *arg) {
9f95a23c 805 auto data = ceph::ref_t<BasicStatCompletionData>(static_cast<BasicStatCompletionData*>(arg), false);
7c673cae
FG
806 int rc = rados_aio_get_return_value(c);
807 // We need to handle the case of sparse files here
808 if (rc < 0) {
809 // remember this has failed
810 data->m_getxattrRC = rc;
811 } else {
812 // this intermediate string allows to add a null terminator before calling strtol
813 std::string err;
814 std::string strsize(data->m_bl.c_str(), data->m_bl.length());
815 *data->m_psize = strict_strtoll(strsize.c_str(), 10, &err);
816 if (!err.empty()) {
817 lderr(data->m_striper->cct()) << XATTR_SIZE << " : " << err << dendl;
818 data->m_getxattrRC = -EINVAL;
819 }
820 rc = 0;
821 }
822 data->m_multiCompletion->complete_request(rc);
7c673cae
FG
823}
824
825static void striper_stat_aio_req_complete(rados_striper_multi_completion_t c,
826 void *arg) {
9f95a23c 827 auto data = ceph::ref_t<BasicStatCompletionData>(static_cast<BasicStatCompletionData*>(arg), false);
7c673cae
FG
828 if (data->m_statRC) {
829 data->complete(data->m_statRC);
830 } else {
831 if (data->m_getxattrRC < 0) {
832 data->complete(data->m_getxattrRC);
833 } else {
834 data->complete(0);
835 }
836 }
7c673cae
FG
837}
838
/**
 * Starts the asynchronous stat of the striped object soid.
 *
 * Two asynchronous calls are issued on the first rados object of soid: the
 * stat itself, through statFunction, and a getxattr of the size attribute.
 * Both are gathered by a multi completion whose callback is
 * striper_stat_aio_req_complete.
 *
 * @param soid         name of the striped object
 * @param c            user completion handed over to the StatCompletionData
 * @param psize        where to store the size of the striped object
 * @param pmtime       where to store the file time
 * @param statFunction member function of the IoCtxImpl starting the stat
 * @return 0 when both calls were started, else the negative return code of
 *         the call that failed to start
 */
839template<class TimeType>
840int libradosstriper::RadosStriperImpl::aio_generic_stat
841(const std::string& soid,
842 librados::AioCompletionImpl *c,
843 uint64_t *psize,
844 TimeType *pmtime,
845 typename libradosstriper::RadosStriperImpl::StatFunction<TimeType>::Type statFunction)
846{
847 // use a MultiAioCompletion object for dealing with the fact
848 // that we'll do 2 asynchronous calls in parallel
224ce89b
WB
849 MultiAioCompletionImplPtr multi_completion{
850 new libradosstriper::MultiAioCompletionImpl, false};
7c673cae
FG
851 // Data object used for passing context to asynchronous calls
852 std::string firstObjOid = getObjectId(soid, 0);
9f95a23c
TL
// each cdata->get() below is called on the object itself (see the
// "create ref!" remarks), once per callback that receives cdata
853 auto cdata = ceph::make_ref<StatCompletionData<TimeType>>(this, firstObjOid, c, multi_completion.get(), psize, pmtime);
854 multi_completion->set_complete_callback(cdata->get() /* create ref! */, striper_stat_aio_req_complete);
7c673cae
FG
855 // use a regular AioCompletion for the stat async call
856 librados::AioCompletion *stat_completion =
9f95a23c 857 librados::Rados::aio_create_completion(cdata->get() /* create ref! */, striper_stat_aio_stat_complete);
7c673cae
FG
858 multi_completion->add_safe_request();
859 object_t obj(firstObjOid);
// the size of the first object lands in cdata->m_objectSize, the file time
// goes straight to the location given by the caller
860 int rc = (m_ioCtxImpl->*statFunction)(obj, stat_completion->pc,
861 &cdata->m_objectSize, cdata->m_pmtime);
862 stat_completion->release();
863 if (rc < 0) {
864 // nothing is really started so cancel everything
// NOTE(review): the data object is deleted directly here, although references
// were taken above for the two callbacks - confirm none of them can still run
9f95a23c 865 delete cdata.detach();
7c673cae
FG
866 return rc;
867 }
868 // use a regular AioCompletion for the getxattr async call
869 librados::AioCompletion *getxattr_completion =
9f95a23c 870 librados::Rados::aio_create_completion(cdata->get() /* create ref! */, striper_stat_aio_getxattr_complete);
7c673cae
FG
871 multi_completion->add_safe_request();
872 // in parallel, get the pmsize from the first object asynchronously
873 rc = m_ioCtxImpl->aio_getxattr(obj, getxattr_completion->pc,
874 XATTR_SIZE, cdata->m_bl);
875 getxattr_completion->release();
876 multi_completion->finish_adding_requests();
877 if (rc < 0) {
878 // the async stat is ongoing, so we need to go on
879 // we mark the getxattr as failed in the data object
880 cdata->m_getxattrRC = rc;
// the getxattr callback will not run, so its request is completed by hand
881 multi_completion->complete_request(rc);
7c673cae
FG
882 return rc;
883 }
7c673cae
FG
884 return 0;
885}
886
887int libradosstriper::RadosStriperImpl::aio_stat(const std::string& soid,
888 librados::AioCompletionImpl *c,
889 uint64_t *psize,
890 time_t *pmtime)
891{
892 return aio_generic_stat<time_t>(soid, c, psize, pmtime, &librados::IoCtxImpl::aio_stat);
893}
894
895int libradosstriper::RadosStriperImpl::stat2(const std::string& soid, uint64_t *psize, struct timespec *pts)
896{
897 // create a completion object
898 librados::AioCompletionImpl c;
899 // call asynchronous version of stat
900 int rc = aio_stat2(soid, &c, psize, pts);
901 if (rc == 0) {
902 // wait for completion of the remove
903 c.wait_for_complete_and_cb();
904 // get result
905 rc = c.get_return_value();
906 }
907 return rc;
908}
909
910int libradosstriper::RadosStriperImpl::aio_stat2(const std::string& soid,
911 librados::AioCompletionImpl *c,
912 uint64_t *psize,
913 struct timespec *pts)
914{
915 return aio_generic_stat<struct timespec>(soid, c, psize, pts, &librados::IoCtxImpl::aio_stat2);
916}
917
918static void rados_req_remove_complete(rados_completion_t c, void *arg)
919{
9f95a23c 920 auto cdata = static_cast<RadosRemoveCompletionData*>(arg);
7c673cae
FG
921 int rc = rados_aio_get_return_value(c);
922 // in case the object did not exist, it means we had a sparse file, all is fine
923 if (rc == -ENOENT) {
924 rc = 0;
925 }
926 cdata->m_multiAioCompl->complete_request(rc);
7c673cae 927 cdata->m_multiAioCompl->safe_request(rc);
7c673cae
FG
928}
929
930static void striper_remove_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
931{
9f95a23c 932 auto cdata = ceph::ref_t<RemoveCompletionData>(static_cast<RemoveCompletionData*>(arg), false);
7c673cae
FG
933 libradosstriper::MultiAioCompletionImpl *comp =
934 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
935 ldout(cdata->m_striper->cct(), 10)
936 << "RadosStriperImpl : striper_remove_aio_req_complete called for "
937 << cdata->m_soid << dendl;
938 int rc = comp->rval;
939 if (rc == 0) {
940 // All went fine, synchronously remove first object
941 rc = cdata->m_striper->m_ioCtx.remove(cdata->m_striper->getObjectId(cdata->m_soid, 0),
942 cdata->flags);
943 } else {
944 lderr(cdata->m_striper->cct())
945 << "RadosStriperImpl : deletion/truncation incomplete for " << cdata->m_soid
946 << ", as errors were encountered. The file is left present but it's content "
947 << " has been partially removed"
948 << dendl;
949 }
950 cdata->complete(rc);
7c673cae
FG
951}
952
953int libradosstriper::RadosStriperImpl::remove(const std::string& soid, int flags)
954{
955 // create a completion object
956 librados::AioCompletionImpl c;
957 // call asynchronous version of remove
958 int rc = aio_remove(soid, &c, flags);
959 if (rc == 0) {
960 // wait for completion of the remove
961 c.wait_for_complete_and_cb();
962 // get result
963 rc = c.get_return_value();
964 }
965 return rc;
966}
967
968int libradosstriper::RadosStriperImpl::aio_remove(const std::string& soid,
969 librados::AioCompletionImpl *c,
970 int flags)
971{
972 // the RemoveCompletionData object will lock the given soid for the duration
973 // of the removal
974 std::string lockCookie = getUUID();
975 int rc = m_ioCtx.lock_exclusive(getObjectId(soid, 0), RADOS_LOCK_NAME, lockCookie, "", 0, 0);
976 if (rc) return rc;
977 // create CompletionData for the async remove call
9f95a23c 978 auto cdata = ceph::make_ref<RemoveCompletionData>(this, soid, lockCookie, c, flags);
224ce89b
WB
979 MultiAioCompletionImplPtr multi_completion{
980 new libradosstriper::MultiAioCompletionImpl, false};
9f95a23c 981 multi_completion->set_complete_callback(cdata->get() /* create ref! */, striper_remove_aio_req_complete);
7c673cae
FG
982 // call asynchronous internal version of remove
983 ldout(cct(), 10)
984 << "RadosStriperImpl : Aio_remove starting for "
985 << soid << dendl;
986 rc = internal_aio_remove(soid, multi_completion);
7c673cae
FG
987 return rc;
988}
989
224ce89b
WB
990int libradosstriper::RadosStriperImpl::internal_aio_remove(
991 const std::string& soid,
992 MultiAioCompletionImplPtr multi_completion,
7c673cae
FG
993 int flags)
994{
995 std::string firstObjOid = getObjectId(soid, 0);
996 try {
997 // check size and get number of rados objects to delete
998 uint64_t nb_objects = 0;
999 bufferlist bl2;
1000 int rc = getxattr(soid, XATTR_SIZE, bl2);
1001 if (rc < 0) {
1002 // no object size (or not able to get it)
1003 // try to find the number of object "by hand"
1004 uint64_t psize;
1005 time_t pmtime;
1006 while (!m_ioCtx.stat(getObjectId(soid, nb_objects), &psize, &pmtime)) {
1007 nb_objects++;
1008 }
1009 } else {
1010 // count total number of rados objects in the striped object
1011 std::string err;
1012 // this intermediate string allows to add a null terminator before calling strtol
1013 std::string strsize(bl2.c_str(), bl2.length());
1014 uint64_t size = strict_strtoll(strsize.c_str(), 10, &err);
1015 if (!err.empty()) {
1016 lderr(cct()) << XATTR_SIZE << " : " << err << dendl;
1017
1018 return -EINVAL;
1019 }
1020 uint64_t object_size = m_layout.fl_object_size;
1021 uint64_t su = m_layout.fl_stripe_unit;
1022 uint64_t stripe_count = m_layout.fl_stripe_count;
1023 uint64_t nb_complete_sets = size / (object_size*stripe_count);
1024 uint64_t remaining_data = size % (object_size*stripe_count);
1025 uint64_t remaining_stripe_units = (remaining_data + su -1) / su;
1026 uint64_t remaining_objects = std::min(remaining_stripe_units, stripe_count);
1027 nb_objects = nb_complete_sets * stripe_count + remaining_objects;
1028 }
1029 // delete rados objects in reverse order
1030 // Note that we do not drop the first object. This one will only be dropped
1031 // if all other removals have been successful, and this is done in the
1032 // callback of the multi_completion object
1033 int rcr = 0;
1034 for (int i = nb_objects-1; i >= 1; i--) {
1035 multi_completion->add_request();
9f95a23c 1036 auto data = ceph::make_ref<RadosRemoveCompletionData>(multi_completion, cct());
7c673cae 1037 librados::AioCompletion *rados_completion =
9f95a23c
TL
1038 librados::Rados::aio_create_completion(data->get() /* create ref! */,
1039 rados_req_remove_complete);
7c673cae
FG
1040 if (flags == 0) {
1041 rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion);
1042 } else {
1043 rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion, flags);
1044 }
1045 rados_completion->release();
1046 if (rcr < 0 and -ENOENT != rcr) {
1047 lderr(cct()) << "RadosStriperImpl::remove : deletion incomplete for " << soid
1048 << ", as " << getObjectId(soid, i) << " could not be deleted (rc=" << rc << ")"
1049 << dendl;
1050 break;
1051 }
1052 }
1053 // we are over adding requests to the multi_completion object
1054 multi_completion->finish_adding_requests();
1055 // return
1056 return rcr;
1057 } catch (ErrorCode &e) {
11fdf7f2 1058 // error caught when trying to take the exclusive lock
7c673cae
FG
1059 return e.m_code;
1060 }
1061
1062}
1063
1064int libradosstriper::RadosStriperImpl::trunc(const std::string& soid, uint64_t size)
1065{
1066 // lock the object in exclusive mode
1067 std::string firstObjOid = getObjectId(soid, 0);
1068 librados::ObjectWriteOperation op;
1069 op.assert_exists();
1070 std::string lockCookie = RadosStriperImpl::getUUID();
1071 utime_t dur = utime_t();
f67539c2 1072 rados::cls::lock::lock(&op, RADOS_LOCK_NAME, ClsLockType::EXCLUSIVE, lockCookie, "", "", dur, 0);
7c673cae
FG
1073 int rc = m_ioCtx.operate(firstObjOid, &op);
1074 if (rc) return rc;
1075 // load layout and size
1076 ceph_file_layout layout;
1077 uint64_t original_size;
1078 rc = internal_get_layout_and_size(firstObjOid, &layout, &original_size);
1079 if (!rc) {
1080 if (size < original_size) {
1081 rc = truncate(soid, original_size, size, layout);
1082 } else if (size > original_size) {
1083 rc = grow(soid, original_size, size, layout);
1084 }
1085 }
1086 // unlock object, ignore return code as we cannot do much
1087 m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie);
1088 // final return
1089 return rc;
1090}
1091
1092
1093///////////////////////// private helpers /////////////////////////////
1094
1095std::string libradosstriper::RadosStriperImpl::getObjectId(const object_t& soid,
1096 long long unsigned objectno)
1097{
1098 std::ostringstream s;
1099 s << soid << '.' << std::setfill ('0') << std::setw(16) << std::hex << objectno;
1100 return s.str();
1101}
1102
1103void libradosstriper::RadosStriperImpl::unlockObject(const std::string& soid,
1104 const std::string& lockCookie)
1105{
1106 // unlock the shared lock on the first rados object
1107 std::string firstObjOid = getObjectId(soid, 0);
1108 m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie);
1109}
1110
1111void libradosstriper::RadosStriperImpl::aio_unlockObject(const std::string& soid,
1112 const std::string& lockCookie,
1113 librados::AioCompletion *c)
1114{
1115 // unlock the shared lock on the first rados object
1116 std::string firstObjOid = getObjectId(soid, 0);
1117 m_ioCtx.aio_unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie, c);
1118}
1119
1120static void rados_write_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg)
1121{
9f95a23c 1122 auto cdata = ceph::ref_t<WriteCompletionData>(static_cast<WriteCompletionData*>(arg), false);
7c673cae
FG
1123 libradosstriper::MultiAioCompletionImpl *comp =
1124 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1125 cdata->complete_unlock(comp->rval);
7c673cae
FG
1126}
1127
1128static void striper_write_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
1129{
9f95a23c 1130 auto cdata = ceph::ref_t<WriteCompletionData>(static_cast<WriteCompletionData*>(arg), false);
7c673cae
FG
1131 // launch the async unlocking of the object
1132 cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion);
1133 // complete the write part in parallel
1134 libradosstriper::MultiAioCompletionImpl *comp =
1135 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1136 cdata->complete_write(comp->rval);
7c673cae
FG
1137}
1138
1139static void striper_write_aio_req_safe(rados_striper_multi_completion_t c, void *arg)
1140{
9f95a23c 1141 auto cdata = ceph::ref_t<WriteCompletionData>(static_cast<WriteCompletionData*>(arg), false);
7c673cae
FG
1142 libradosstriper::MultiAioCompletionImpl *comp =
1143 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1144 cdata->safe(comp->rval);
7c673cae
FG
1145}
1146
1147int libradosstriper::RadosStriperImpl::write_in_open_object(const std::string& soid,
1148 const ceph_file_layout& layout,
1149 const std::string& lockCookie,
1150 const bufferlist& bl,
1151 size_t len,
1152 uint64_t off) {
1153 // create a completion object to be passed to the callbacks of the multicompletion
1154 // we need 3 references as striper_write_aio_req_complete will release two and
1155 // striper_write_aio_req_safe will release one
9f95a23c 1156 auto cdata = ceph::make_ref<WriteCompletionData>(this, soid, lockCookie, nullptr);
7c673cae
FG
1157 // create a completion object for the unlocking of the striped object at the end of the write
1158 librados::AioCompletion *unlock_completion =
9f95a23c 1159 librados::Rados::aio_create_completion(cdata->get() /* create ref! */, rados_write_aio_unlock_complete);
7c673cae
FG
1160 cdata->m_unlockCompletion = unlock_completion;
1161 // create the multicompletion that will handle the write completion
224ce89b
WB
1162 MultiAioCompletionImplPtr c{new libradosstriper::MultiAioCompletionImpl,
1163 false};
9f95a23c
TL
1164 c->set_complete_callback(cdata->get() /* create ref! */, striper_write_aio_req_complete);
1165 c->set_safe_callback(cdata->get() /* create ref! */, striper_write_aio_req_safe);
7c673cae
FG
1166 // call the asynchronous API
1167 int rc = internal_aio_write(soid, c, bl, len, off, layout);
1168 if (!rc) {
1169 // wait for completion and safety of data
1170 c->wait_for_complete_and_cb();
1171 c->wait_for_safe_and_cb();
1172 // wait for the unlocking
1173 unlock_completion->wait_for_complete();
1174 // return result
1175 rc = c->get_return_value();
1176 }
7c673cae
FG
1177 return rc;
1178}
1179
1180int libradosstriper::RadosStriperImpl::aio_write_in_open_object(const std::string& soid,
1181 librados::AioCompletionImpl *c,
1182 const ceph_file_layout& layout,
1183 const std::string& lockCookie,
1184 const bufferlist& bl,
1185 size_t len,
1186 uint64_t off) {
1187 // create a completion object to be passed to the callbacks of the multicompletion
1188 // we need 3 references as striper_write_aio_req_complete will release two and
1189 // striper_write_aio_req_safe will release one
9f95a23c 1190 auto cdata = ceph::make_ref<WriteCompletionData>(this, soid, lockCookie, c);
7c673cae
FG
1191 m_ioCtxImpl->get();
1192 c->io = m_ioCtxImpl;
1193 // create a completion object for the unlocking of the striped object at the end of the write
1194 librados::AioCompletion *unlock_completion =
9f95a23c 1195 librados::Rados::aio_create_completion(cdata->get() /* create ref! */, rados_write_aio_unlock_complete);
7c673cae
FG
1196 cdata->m_unlockCompletion = unlock_completion;
1197 // create the multicompletion that will handle the write completion
224ce89b
WB
1198 libradosstriper::MultiAioCompletionImplPtr nc{
1199 new libradosstriper::MultiAioCompletionImpl, false};
9f95a23c
TL
1200 nc->set_complete_callback(cdata->get() /* create ref! */, striper_write_aio_req_complete);
1201 nc->set_safe_callback(cdata->get() /* create ref! */, striper_write_aio_req_safe);
7c673cae
FG
1202 // internal asynchronous API
1203 int rc = internal_aio_write(soid, nc, bl, len, off, layout);
7c673cae
FG
1204 return rc;
1205}
1206
7c673cae
FG
1207static void rados_req_write_complete(rados_completion_t c, void *arg)
1208{
9f95a23c 1209 auto comp = reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(arg);
7c673cae 1210 comp->complete_request(rados_aio_get_return_value(c));
9f95a23c 1211 comp->safe_request(rados_aio_get_return_value(c));
7c673cae
FG
1212}
1213
1214int
1215libradosstriper::RadosStriperImpl::internal_aio_write(const std::string& soid,
224ce89b 1216 libradosstriper::MultiAioCompletionImplPtr c,
7c673cae
FG
1217 const bufferlist& bl,
1218 size_t len,
1219 uint64_t off,
1220 const ceph_file_layout& layout)
1221{
1222 int r = 0;
1223 // Do not try anything if we are called with empty buffer,
1224 // file_to_extents would raise an exception
1225 if (len > 0) {
1226 // get list of extents to be written to
1227 vector<ObjectExtent> extents;
c07f9fc5
FG
1228 std::string format = soid;
1229 boost::replace_all(format, "%", "%%");
1230 format += RADOS_OBJECT_EXTENSION_FORMAT;
7c673cae
FG
1231 file_layout_t l;
1232 l.from_legacy(layout);
1233 Striper::file_to_extents(cct(), format.c_str(), &l, off, len, 0, extents);
1234 // go through the extents
1235 for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
1236 // assemble pieces of a given object into a single buffer list
1237 bufferlist oid_bl;
1238 for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
1239 q != p->buffer_extents.end();
1240 ++q) {
1241 bufferlist buffer_bl;
1242 buffer_bl.substr_of(bl, q->first, q->second);
1243 oid_bl.append(buffer_bl);
1244 }
1245 // and write the object
1246 c->add_request();
1247 librados::AioCompletion *rados_completion =
224ce89b 1248 librados::Rados::aio_create_completion(c.get(),
9f95a23c 1249 rados_req_write_complete);
224ce89b
WB
1250 r = m_ioCtx.aio_write(p->oid.name, rados_completion, oid_bl,
1251 p->length, p->offset);
7c673cae
FG
1252 rados_completion->release();
1253 if (r < 0)
1254 break;
1255 }
1256 }
1257 c->finish_adding_requests();
1258 return r;
1259}
1260
1261int libradosstriper::RadosStriperImpl::extract_uint32_attr
1262(std::map<std::string, bufferlist> &attrs,
1263 const std::string& key,
1264 ceph_le32 *value)
1265{
1266 std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key);
1267 if (attrsIt != attrs.end()) {
1268 // this intermediate string allows to add a null terminator before calling strtol
1269 std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length());
1270 std::string err;
1271 *value = strict_strtol(strvalue.c_str(), 10, &err);
1272 if (!err.empty()) {
1273 lderr(cct()) << key << " : " << err << dendl;
1274 return -EINVAL;
1275 }
1276 } else {
1277 return -ENOENT;
1278 }
1279 return 0;
1280}
1281
1282int libradosstriper::RadosStriperImpl::extract_sizet_attr
1283(std::map<std::string, bufferlist> &attrs,
1284 const std::string& key,
1285 size_t *value)
1286{
1287 std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key);
1288 if (attrsIt != attrs.end()) {
1289 // this intermediate string allows to add a null terminator before calling strtol
1290 std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length());
1291 std::string err;
1292 *value = strict_strtoll(strvalue.c_str(), 10, &err);
1293 if (!err.empty()) {
1294 lderr(cct()) << key << " : " << err << dendl;
1295 return -EINVAL;
1296 }
1297 } else {
1298 return -ENOENT;
1299 }
1300 return 0;
1301}
1302
1303int libradosstriper::RadosStriperImpl::internal_get_layout_and_size(
1304 const std::string& oid,
1305 ceph_file_layout *layout,
1306 uint64_t *size)
1307{
1308 // get external attributes of the first rados object
1309 std::map<std::string, bufferlist> attrs;
1310 int rc = m_ioCtx.getxattrs(oid, attrs);
1311 if (rc) return rc;
1312 // deal with stripe_unit
1313 rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_UNIT, &layout->fl_stripe_unit);
1314 if (rc) return rc;
1315 // deal with stripe_count
1316 rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_COUNT, &layout->fl_stripe_count);
1317 if (rc) return rc;
1318 // deal with object_size
1319 rc = extract_uint32_attr(attrs, XATTR_LAYOUT_OBJECT_SIZE, &layout->fl_object_size);
1320 if (rc) return rc;
1321 // deal with size
1322 size_t ssize;
1323 rc = extract_sizet_attr(attrs, XATTR_SIZE, &ssize);
1324 if (rc) {
1325 return rc;
1326 }
1327 *size = ssize;
1328 // make valgrind happy by setting unused fl_pg_pool
1329 layout->fl_pg_pool = 0;
1330 return 0;
1331}
1332
1333int libradosstriper::RadosStriperImpl::openStripedObjectForRead(
1334 const std::string& soid,
1335 ceph_file_layout *layout,
1336 uint64_t *size,
1337 std::string *lockCookie)
1338{
1339 // take a lock the first rados object, if it exists and gets its size
1340 // check, lock and size reading must be atomic and are thus done within a single operation
1341 librados::ObjectWriteOperation op;
1342 op.assert_exists();
1343 *lockCookie = getUUID();
1344 utime_t dur = utime_t();
f67539c2 1345 rados::cls::lock::lock(&op, RADOS_LOCK_NAME, ClsLockType::SHARED, *lockCookie, "Tag", "", dur, 0);
7c673cae
FG
1346 std::string firstObjOid = getObjectId(soid, 0);
1347 int rc = m_ioCtx.operate(firstObjOid, &op);
1348 if (rc) {
1349 // error case (including -ENOENT)
1350 return rc;
1351 }
1352 rc = internal_get_layout_and_size(firstObjOid, layout, size);
1353 if (rc) {
1354 unlockObject(soid, *lockCookie);
1355 lderr(cct()) << "RadosStriperImpl::openStripedObjectForRead : "
1356 << "could not load layout and size for "
1357 << soid << " : rc = " << rc << dendl;
1358 }
1359 return rc;
1360}
1361
1362int libradosstriper::RadosStriperImpl::openStripedObjectForWrite(const std::string& soid,
1363 ceph_file_layout *layout,
1364 uint64_t *size,
1365 std::string *lockCookie,
1366 bool isFileSizeAbsolute)
1367{
1368 // take a lock the first rados object, if it exists
1369 // check and lock must be atomic and are thus done within a single operation
1370 librados::ObjectWriteOperation op;
1371 op.assert_exists();
1372 *lockCookie = getUUID();
1373 utime_t dur = utime_t();
f67539c2 1374 rados::cls::lock::lock(&op, RADOS_LOCK_NAME, ClsLockType::SHARED, *lockCookie, "Tag", "", dur, 0);
7c673cae
FG
1375 std::string firstObjOid = getObjectId(soid, 0);
1376 int rc = m_ioCtx.operate(firstObjOid, &op);
1377 if (rc) {
1378 if (rc == -ENOENT) {
1379 // object does not exist, delegate to createEmptyStripedObject
1380 int rc = createAndOpenStripedObject(soid, layout, *size, lockCookie, isFileSizeAbsolute);
1381 // return original size
1382 *size = 0;
1383 return rc;
1384 } else {
1385 return rc;
1386 }
1387 }
1388 // all fine
1389 uint64_t curSize;
1390 rc = internal_get_layout_and_size(firstObjOid, layout, &curSize);
1391 if (rc) {
1392 unlockObject(soid, *lockCookie);
1393 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1394 << "could not load layout and size for "
1395 << soid << " : rc = " << rc << dendl;
1396 return rc;
1397 }
1398 // atomically update object size, only if smaller than current one
1399 if (!isFileSizeAbsolute)
1400 *size += curSize;
1401 librados::ObjectWriteOperation writeOp;
1402 writeOp.cmpxattr(XATTR_SIZE, LIBRADOS_CMPXATTR_OP_GT, *size);
1403 std::ostringstream oss;
1404 oss << *size;
1405 bufferlist bl;
1406 bl.append(oss.str());
1407 writeOp.setxattr(XATTR_SIZE, bl);
1408 rc = m_ioCtx.operate(firstObjOid, &writeOp);
1409 // return current size
1410 *size = curSize;
1411 // handle case where objectsize is already bigger than size
1412 if (-ECANCELED == rc)
1413 rc = 0;
1414 if (rc) {
1415 unlockObject(soid, *lockCookie);
1416 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1417 << "could not set new size for "
1418 << soid << " : rc = " << rc << dendl;
1419 }
1420 return rc;
1421}
1422
1423int libradosstriper::RadosStriperImpl::createAndOpenStripedObject(const std::string& soid,
1424 ceph_file_layout *layout,
1425 uint64_t size,
1426 std::string *lockCookie,
1427 bool isFileSizeAbsolute)
1428{
1429 // build atomic write operation
1430 librados::ObjectWriteOperation writeOp;
1431 writeOp.create(true);
1432 // object_size
1433 std::ostringstream oss_object_size;
1434 oss_object_size << m_layout.fl_object_size;
1435 bufferlist bl_object_size;
1436 bl_object_size.append(oss_object_size.str());
1437 writeOp.setxattr(XATTR_LAYOUT_OBJECT_SIZE, bl_object_size);
1438 // stripe unit
1439 std::ostringstream oss_stripe_unit;
1440 oss_stripe_unit << m_layout.fl_stripe_unit;
1441 bufferlist bl_stripe_unit;
1442 bl_stripe_unit.append(oss_stripe_unit.str());
1443 writeOp.setxattr(XATTR_LAYOUT_STRIPE_UNIT, bl_stripe_unit);
1444 // stripe count
1445 std::ostringstream oss_stripe_count;
1446 oss_stripe_count << m_layout.fl_stripe_count;
1447 bufferlist bl_stripe_count;
1448 bl_stripe_count.append(oss_stripe_count.str());
1449 writeOp.setxattr(XATTR_LAYOUT_STRIPE_COUNT, bl_stripe_count);
1450 // size
1451 std::ostringstream oss_size;
1452 oss_size << (isFileSizeAbsolute?size:0);
1453 bufferlist bl_size;
1454 bl_size.append(oss_size.str());
1455 writeOp.setxattr(XATTR_SIZE, bl_size);
1456 // effectively change attributes
1457 std::string firstObjOid = getObjectId(soid, 0);
1458 int rc = m_ioCtx.operate(firstObjOid, &writeOp);
1459 // in case of error (but no EEXIST which would mean the object existed), return
1460 if (rc && -EEXIST != rc) return rc;
1461 // Otherwise open the object
1462 uint64_t fileSize = size;
1463 return openStripedObjectForWrite(soid, layout, &fileSize, lockCookie, isFileSizeAbsolute);
1464}
1465
1466static void striper_truncate_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
1467{
9f95a23c 1468 auto cdata = ceph::ref_t<TruncateCompletionData>(static_cast<TruncateCompletionData*>(arg), false);
7c673cae
FG
1469 libradosstriper::MultiAioCompletionImpl *comp =
1470 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1471 if (0 == comp->rval) {
1472 // all went fine, change size in the external attributes
1473 std::ostringstream oss;
1474 oss << cdata->m_size;
1475 bufferlist bl;
1476 bl.append(oss.str());
1477 cdata->m_striper->setxattr(cdata->m_soid, XATTR_SIZE, bl);
1478 }
7c673cae
FG
1479}
1480
1481int libradosstriper::RadosStriperImpl::truncate(const std::string& soid,
1482 uint64_t original_size,
1483 uint64_t size,
1484 ceph_file_layout &layout)
1485{
9f95a23c 1486 auto cdata = ceph::make_ref<TruncateCompletionData>(this, soid, size);
224ce89b
WB
1487 libradosstriper::MultiAioCompletionImplPtr multi_completion{
1488 new libradosstriper::MultiAioCompletionImpl, false};
9f95a23c 1489 multi_completion->set_complete_callback(cdata->get() /* create ref! */, striper_truncate_aio_req_complete);
7c673cae
FG
1490 // call asynchrous version of truncate
1491 int rc = aio_truncate(soid, multi_completion, original_size, size, layout);
1492 // wait for completion of the truncation
1493 multi_completion->finish_adding_requests();
1494 multi_completion->wait_for_complete_and_cb();
1495 // return result
1496 if (rc == 0) {
1497 rc = multi_completion->get_return_value();
1498 }
7c673cae
FG
1499 return rc;
1500}
1501
1502int libradosstriper::RadosStriperImpl::aio_truncate
1503(const std::string& soid,
224ce89b 1504 libradosstriper::MultiAioCompletionImplPtr multi_completion,
7c673cae
FG
1505 uint64_t original_size,
1506 uint64_t size,
1507 ceph_file_layout &layout)
1508{
1509 // handle the underlying rados objects. 3 cases here :
1510 // -- the objects belonging to object sets entirely located
1511 // before the truncation are unchanged
1512 // -- the objects belonging to the object set where the
1513 // truncation took place are truncated or removed
1514 // -- the objects belonging to object sets entirely located
1515 // after the truncation are removed
1516 // Note that we do it backward and that we change the size in
1517 // the external attributes only at the end. This make sure that
1518 // no rados object stays behind if we remove the striped object
1519 // after a truncation has failed
1520 uint64_t trunc_objectsetno = size / layout.fl_object_size / layout.fl_stripe_count;
1521 uint64_t last_objectsetno = original_size / layout.fl_object_size / layout.fl_stripe_count;
1522 bool exists = false;
1523 for (int64_t objectno = (last_objectsetno+1) * layout.fl_stripe_count-1;
1524 objectno >= (int64_t)((trunc_objectsetno + 1) * layout.fl_stripe_count);
1525 objectno--) {
1526 // if no object existed so far, check object existence
1527 if (!exists) {
1528 uint64_t nb_full_object_set = objectno / layout.fl_stripe_count;
1529 uint64_t object_index_in_set = objectno % layout.fl_stripe_count;
1530 uint64_t set_start_off = nb_full_object_set * layout.fl_object_size * layout.fl_stripe_count;
1531 uint64_t object_start_off = set_start_off + object_index_in_set * layout.fl_stripe_unit;
1532 exists = (original_size > object_start_off);
1533 }
1534 if (exists) {
1535 // remove asynchronously
1536 multi_completion->add_request();
9f95a23c 1537 auto data = ceph::make_ref<RadosRemoveCompletionData>(multi_completion, cct());
7c673cae 1538 librados::AioCompletion *rados_completion =
9f95a23c
TL
1539 librados::Rados::aio_create_completion(data->get() /* create ref! */,
1540 rados_req_remove_complete);
7c673cae
FG
1541 int rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion);
1542 rados_completion->release();
1543 // in case the object did not exist, it means we had a sparse file, all is fine
1544 if (rc && rc != -ENOENT) return rc;
1545 }
1546 }
1547 for (int64_t objectno = ((trunc_objectsetno + 1) * layout.fl_stripe_count) -1;
1548 objectno >= (int64_t)(trunc_objectsetno * layout.fl_stripe_count);
1549 objectno--) {
1550 // if no object existed so far, check object existence
1551 if (!exists) {
1552 uint64_t object_start_off = ((objectno / layout.fl_stripe_count) * layout.fl_object_size) +
1553 ((objectno % layout.fl_stripe_count) * layout.fl_stripe_unit);
1554 exists = (original_size > object_start_off);
1555 }
1556 if (exists) {
1557 // truncate
1558 file_layout_t l;
1559 l.from_legacy(layout);
1560 uint64_t new_object_size = Striper::object_truncate_size(cct(), &l, objectno, size);
1561 int rc;
1562 if (new_object_size > 0 or 0 == objectno) {
1563 // trunc is synchronous as there is no async version
1564 // but note that only a single object will be truncated
1565 // reducing the overload to a fixed amount
1566 rc = m_ioCtx.trunc(getObjectId(soid, objectno), new_object_size);
1567 } else {
1568 // removes are asynchronous in order to speed up truncations of big files
1569 multi_completion->add_request();
9f95a23c 1570 auto data = ceph::make_ref<RadosRemoveCompletionData>(multi_completion, cct());
7c673cae 1571 librados::AioCompletion *rados_completion =
9f95a23c
TL
1572 librados::Rados::aio_create_completion(data->get() /* create ref! */,
1573 rados_req_remove_complete);
7c673cae
FG
1574 rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion);
1575 rados_completion->release();
1576 }
1577 // in case the object did not exist, it means we had a sparse file, all is fine
1578 if (rc && rc != -ENOENT) return rc;
1579 }
1580 }
1581 return 0;
1582}
1583
1584int libradosstriper::RadosStriperImpl::grow(const std::string& soid,
1585 uint64_t original_size,
1586 uint64_t size,
1587 ceph_file_layout &layout)
1588{
1589 // handle the underlying rados objects. As we support sparse objects,
1590 // we only have to change the size in the external attributes
1591 std::ostringstream oss;
1592 oss << size;
1593 bufferlist bl;
1594 bl.append(oss.str());
1595 int rc = m_ioCtx.setxattr(getObjectId(soid, 0), XATTR_SIZE, bl);
1596 return rc;
1597}
1598
1599std::string libradosstriper::RadosStriperImpl::getUUID()
1600{
1601 struct uuid_d uuid;
1602 uuid.generate_random();
1603 char suuid[37];
1604 uuid.print(suuid);
1605 return std::string(suuid);
1606}