]> git.proxmox.com Git - ceph.git/blame - ceph/src/libradosstriper/RadosStriperImpl.cc
update sources to v12.1.1
[ceph.git] / ceph / src / libradosstriper / RadosStriperImpl.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "libradosstriper/RadosStriperImpl.h"
16
17#include <errno.h>
18
19#include <sstream>
20#include <iomanip>
21#include <algorithm>
22
23#include "include/types.h"
24#include "include/uuid.h"
25#include "include/ceph_fs.h"
26#include "common/dout.h"
27#include "common/strtol.h"
28#include "osdc/Striper.h"
7c673cae
FG
29#include "librados/AioCompletionImpl.h"
30#include <cls/lock/cls_lock_client.h>
31
32/*
33 * This file contains the actual implementation of the rados striped objects interface.
34 *
35 * Striped objects are stored in rados in a set of regular rados objects, after their
36 * content has been striped using the osdc/Striper interface.
37 *
38 * The external attributes of the striped object are mapped to the attributes of the
39 * first underlying object. This first object has a set of extra external attributes
40 * storing the layout of the striped object for future read back. These attributes are :
41 * - striper.layout.object_size : the size of rados objects used.
42 * Must be a multiple of striper.layout.stripe_unit
43 * - striper.layout.stripe_unit : the size of a stripe unit
44 * - striper.layout.stripe_count : the number of stripes used
45 * - striper.size : total striped object size
46 *
47 * In general operations on striped objects are not atomic.
48 * However, a certain number of safety guards have been put to make the interface closer
49 * to atomicity :
50 * - each data operation takes a shared lock on the first rados object for the
51 * whole time of the operation
52 * - the remove and trunc operations take an exclusive lock on the first rados object
53 * for the whole time of the operation
54 * This makes sure that no removal/truncation of a striped object occurs while
55 * data operations are happening and vice versa. It thus makes sure that the layout
56 * of a striped object does not change during data operation, which is essential for
57 * data consistency.
58 *
59 * Still the writing to a striped object is not atomic. This means in particular that
60 * the size of an object may not be in sync with its content at all times.
61 * As the size is always guaranteed to be updated first and in an atomic way, and as
62 * sparse striped objects are supported (see below), what will typically happen is
63 * that a reader that comes too soon after a write will read 0s instead of the actual
64 * data.
65 *
66 * Note that remove handles the pieces of the striped object in reverse order,
67 * so that the head object is removed last, making the completion of the deletion atomic.
68 *
69 * Striped objects can be sparse, typically in case data was written at the end of the
70 * striped object only. In such a case, some rados objects constituting the striped object
71 * may be missing. Others can be partial (only the beginning will have data)
72 * When dealing with such sparse striped files, missing objects are detected and
73 * considered as full of 0s. They are however not created until real data is written
74 * to them.
75 *
76 * There are a number of missing features/improvements that could be implemented.
77 * Here are some ideas :
78 * - implementation of missing entry points (compared to rados)
79 * In particular : clone_range, sparse_read, exec, aio_flush_async, tmaps, omaps, ...
80 *
81 */
82
83#define dout_subsys ceph_subsys_rados
84#undef dout_prefix
85#define dout_prefix *_dout << "libradosstriper: "
86
87/// size of xattr buffer
88#define XATTR_BUFFER_SIZE 32
89
90/// names of the different xattr entries
91#define XATTR_LAYOUT_STRIPE_UNIT "striper.layout.stripe_unit"
92#define XATTR_LAYOUT_STRIPE_COUNT "striper.layout.stripe_count"
93#define XATTR_LAYOUT_OBJECT_SIZE "striper.layout.object_size"
94#define XATTR_SIZE "striper.size"
95#define LOCK_PREFIX "lock."
96
97/// name of the lock used on objects to ensure layout stability during IO
98#define RADOS_LOCK_NAME "striper.lock"
99
100/// format of the extension of rados objects created for a given striped object
101#define RADOS_OBJECT_EXTENSION_FORMAT ".%016llx"
102
103/// default object layout
104struct ceph_file_layout default_file_layout = {
105 init_le32(1<<22), // fl_stripe_unit
106 init_le32(1), // fl_stripe_count
107 init_le32(1<<22), // fl_object_size
108 init_le32(0), // fl_cas_hash
109 init_le32(0), // fl_object_stripe_unit
110 init_le32(-1), // fl_unused
111 init_le32(-1), // fl_pg_pool
112};
113
224ce89b
WB
114using libradosstriper::MultiAioCompletionImplPtr;
115
116namespace {
7c673cae
FG
117
118///////////////////////// CompletionData /////////////////////////////
119
224ce89b
WB
120/**
121 * struct handling the data needed to pass to the call back
122 * function in asynchronous operations
123 */
124struct CompletionData : RefCountedObject {
125 /// constructor
126 CompletionData(libradosstriper::RadosStriperImpl * striper,
127 const std::string& soid,
128 const std::string& lockCookie,
129 librados::AioCompletionImpl *userCompletion = 0,
130 int n = 1);
131 /// destructor
132 ~CompletionData() override;
133 /// complete method
134 void complete(int r);
135 /// striper to be used to handle the write completion
136 libradosstriper::RadosStriperImpl *m_striper;
137 /// striped object concerned by the write operation
138 std::string m_soid;
139 /// shared lock to be released at completion
140 std::string m_lockCookie;
141 /// completion handler
142 librados::IoCtxImpl::C_aio_Complete *m_ack;
143};
144
145CompletionData::CompletionData
7c673cae
FG
146(libradosstriper::RadosStriperImpl* striper,
147 const std::string& soid,
148 const std::string& lockCookie,
149 librados::AioCompletionImpl *userCompletion,
150 int n) :
151 RefCountedObject(striper->cct(), n),
152 m_striper(striper), m_soid(soid), m_lockCookie(lockCookie), m_ack(0) {
153 m_striper->get();
154 if (userCompletion) {
155 m_ack = new librados::IoCtxImpl::C_aio_Complete(userCompletion);
156 userCompletion->io = striper->m_ioCtxImpl;
157 }
158}
159
224ce89b 160CompletionData::~CompletionData() {
7c673cae
FG
161 if (m_ack) delete m_ack;
162 m_striper->put();
163}
164
224ce89b 165void CompletionData::complete(int r) {
7c673cae
FG
166 if (m_ack) m_ack->finish(r);
167}
168
224ce89b
WB
169/**
170 * struct handling the data needed to pass to the call back
171 * function in asynchronous read operations
172 */
173struct ReadCompletionData : CompletionData {
174 /// bufferlist containing final result
175 bufferlist* m_bl;
176 /// extents that will be read
177 std::vector<ObjectExtent>* m_extents;
178 /// intermediate results
179 std::vector<bufferlist>* m_resultbl;
180 /// return code of read completion, to be remembered until unlocking happened
181 int m_readRc;
182 /// completion object for the unlocking of the striped object at the end of the read
183 librados::AioCompletion *m_unlockCompletion;
184 /// constructor
185 ReadCompletionData(libradosstriper::RadosStriperImpl * striper,
186 const std::string& soid,
187 const std::string& lockCookie,
188 librados::AioCompletionImpl *userCompletion,
189 bufferlist* bl,
190 std::vector<ObjectExtent>* extents,
191 std::vector<bufferlist>* resultbl,
192 int n);
193 /// destructor
194 ~ReadCompletionData() override;
195 /// complete method for when reading is over
196 void complete_read(int r);
197 /// complete method for when object is unlocked
198 void complete_unlock(int r);
199};
200
201ReadCompletionData::ReadCompletionData
7c673cae
FG
202(libradosstriper::RadosStriperImpl* striper,
203 const std::string& soid,
204 const std::string& lockCookie,
205 librados::AioCompletionImpl *userCompletion,
206 bufferlist* bl,
207 std::vector<ObjectExtent>* extents,
208 std::vector<bufferlist>* resultbl,
209 int n) :
210 CompletionData(striper, soid, lockCookie, userCompletion, n),
211 m_bl(bl), m_extents(extents), m_resultbl(resultbl), m_readRc(0),
212 m_unlockCompletion(0) {}
213
224ce89b 214ReadCompletionData::~ReadCompletionData() {
7c673cae
FG
215 m_unlockCompletion->release();
216 delete m_extents;
217 delete m_resultbl;
218}
219
224ce89b 220void ReadCompletionData::complete_read(int r) {
7c673cae
FG
221 // gather data into final buffer
222 Striper::StripedReadResult readResult;
223 vector<bufferlist>::iterator bit = m_resultbl->begin();
224 for (vector<ObjectExtent>::iterator eit = m_extents->begin();
225 eit != m_extents->end();
226 ++eit, ++bit) {
227 readResult.add_partial_result(m_striper->cct(), *bit, eit->buffer_extents);
228 }
229 m_bl->clear();
230 readResult.assemble_result(m_striper->cct(), *m_bl, true);
231 // Remember return code
232 m_readRc = r;
233}
234
224ce89b 235void ReadCompletionData::complete_unlock(int r) {
7c673cae
FG
236 // call parent's completion method
237 // Note that we ignore the return code of the unlock as we cannot do much about it
238 CompletionData::complete(m_readRc?m_readRc:m_bl->length());
239}
240
224ce89b
WB
241/**
242 * struct handling the data needed to pass to the call back
243 * function in asynchronous write operations
244 */
245struct WriteCompletionData : CompletionData {
246 /// safe completion handler
247 librados::IoCtxImpl::C_aio_Complete *m_safe;
248 /// return code of write completion, to be remembered until unlocking happened
249 int m_writeRc;
250 /// completion object for the unlocking of the striped object at the end of the write
251 librados::AioCompletion *m_unlockCompletion;
252 /// constructor
253 WriteCompletionData(libradosstriper::RadosStriperImpl * striper,
254 const std::string& soid,
255 const std::string& lockCookie,
256 librados::AioCompletionImpl *userCompletion,
257 int n);
258 /// destructor
259 ~WriteCompletionData() override;
260 /// complete method for when writing is over
261 void complete_write(int r);
262 /// complete method for when object is unlocked
263 void complete_unlock(int r);
264 /// safe method
265 void safe(int r);
266};
267
268WriteCompletionData::WriteCompletionData
7c673cae
FG
269(libradosstriper::RadosStriperImpl* striper,
270 const std::string& soid,
271 const std::string& lockCookie,
272 librados::AioCompletionImpl *userCompletion,
273 int n) :
274 CompletionData(striper, soid, lockCookie, userCompletion, n), m_safe(0),
275 m_unlockCompletion(0) {
276 if (userCompletion) {
277 m_safe = new librados::IoCtxImpl::C_aio_Complete(userCompletion);
278 }
279}
280
224ce89b 281WriteCompletionData::~WriteCompletionData() {
7c673cae
FG
282 m_unlockCompletion->release();
283 if (m_safe) delete m_safe;
284}
285
224ce89b 286void WriteCompletionData::complete_unlock(int r) {
7c673cae
FG
287 // call parent's completion method
288 // Note that we ignore the return code of the unlock as we cannot do much about it
289 CompletionData::complete(m_writeRc);
290}
291
224ce89b 292void WriteCompletionData::complete_write(int r) {
7c673cae
FG
293 // Remember return code
294 m_writeRc = r;
295}
296
224ce89b 297void WriteCompletionData::safe(int r) {
7c673cae
FG
298 if (m_safe) m_safe->finish(r);
299}
300
224ce89b
WB
301struct RemoveCompletionData : CompletionData {
302 /// removal flags
303 int flags;
304 /**
305 * constructor
306 * note that the constructed object will take ownership of the lock
307 */
308 RemoveCompletionData(libradosstriper::RadosStriperImpl * striper,
309 const std::string& soid,
310 const std::string& lockCookie,
311 librados::AioCompletionImpl *userCompletion,
312 int flags = 0) :
7c673cae 313 CompletionData(striper, soid, lockCookie, userCompletion), flags(flags) {}
224ce89b 314};
7c673cae 315
224ce89b
WB
316/**
317 * struct handling the data needed to pass to the call back
318 * function in asynchronous truncate operations
319 */
320struct TruncateCompletionData : RefCountedObject {
321 /// constructor
322 TruncateCompletionData(libradosstriper::RadosStriperImpl* striper,
323 const std::string& soid,
324 uint64_t size) :
325 RefCountedObject(striper->cct()),
326 m_striper(striper), m_soid(soid), m_size(size) {
327 m_striper->get();
328 }
329 /// destructor
330 ~TruncateCompletionData() override {
331 m_striper->put();
332 }
333 /// striper to be used
334 libradosstriper::RadosStriperImpl *m_striper;
335 /// striped object concerned by the truncate operation
336 std::string m_soid;
337 /// the final size of the truncated object
338 uint64_t m_size;
339};
7c673cae 340
224ce89b
WB
341/**
342 * struct handling the data needed to pass to the call back
343 * function in asynchronous read operations of a Rados File
344 */
345struct RadosReadCompletionData : RefCountedObject {
346 /// constructor
347 RadosReadCompletionData(MultiAioCompletionImplPtr multiAioCompl,
348 uint64_t expectedBytes,
349 bufferlist *bl,
350 CephContext *context,
351 int n = 1) :
352 RefCountedObject(context, n),
353 m_multiAioCompl(multiAioCompl), m_expectedBytes(expectedBytes), m_bl(bl) {}
354 /// the multi asynch io completion object to be used
355 MultiAioCompletionImplPtr m_multiAioCompl;
356 /// the expected number of bytes
357 uint64_t m_expectedBytes;
358 /// the bufferlist object where data have been written
359 bufferlist *m_bl;
360};
361
362/**
363 * struct handling (most of) the data needed to pass to the call back
364 * function in asynchronous stat operations.
365 * Inherited by the actual type for adding time information in different
366 * versions (time_t or struct timespec)
367 */
368struct BasicStatCompletionData : CompletionData {
369 /// constructor
370 BasicStatCompletionData(libradosstriper::RadosStriperImpl* striper,
371 const std::string& soid,
372 librados::AioCompletionImpl *userCompletion,
373 libradosstriper::MultiAioCompletionImpl *multiCompletion,
374 uint64_t *psize,
375 int n = 1) :
376 CompletionData(striper, soid, "", userCompletion, n),
377 m_multiCompletion(multiCompletion), m_psize(psize),
378 m_statRC(0), m_getxattrRC(0) {};
379 // MultiAioCompletionImpl used to handle the double aysnc
380 // call in the back (stat + getxattr)
381 libradosstriper::MultiAioCompletionImpl *m_multiCompletion;
382 // where to store the size of first objct
383 // this will be ignored but we need a place to store it when
384 // async stat is called
385 uint64_t m_objectSize;
386 // where to store the file size
387 uint64_t *m_psize;
388 /// the bufferlist object used for the getxattr call
389 bufferlist m_bl;
390 /// return code of the stat
391 int m_statRC;
392 /// return code of the getxattr
393 int m_getxattrRC;
394};
395
396/**
397 * struct handling the data needed to pass to the call back
398 * function in asynchronous stat operations.
399 * Simple templated extension of BasicStatCompletionData.
400 * The template parameter is the type of the time information
401 * (used with time_t for stat and struct timespec for stat2)
402 */
403template<class TimeType>
404struct StatCompletionData : BasicStatCompletionData {
405 /// constructor
406 StatCompletionData(libradosstriper::RadosStriperImpl* striper,
407 const std::string& soid,
408 librados::AioCompletionImpl *userCompletion,
409 libradosstriper::MultiAioCompletionImpl *multiCompletion,
410 uint64_t *psize,
411 TimeType *pmtime,
412 int n = 1) :
413 BasicStatCompletionData(striper, soid, userCompletion, multiCompletion, psize, n),
414 m_pmtime(pmtime) {};
415 // where to store the file time
416 TimeType *m_pmtime;
417};
418
419/**
420 * struct handling the data needed to pass to the call back
421 * function in asynchronous remove operations of a Rados File
422 */
423struct RadosRemoveCompletionData : RefCountedObject {
424 /// constructor
425 RadosRemoveCompletionData(MultiAioCompletionImplPtr multiAioCompl,
426 CephContext *context) :
427 RefCountedObject(context, 2),
428 m_multiAioCompl(multiAioCompl) {};
429 /// the multi asynch io completion object to be used
430 MultiAioCompletionImplPtr m_multiAioCompl;
431};
432
433
434} // namespace {
7c673cae
FG
435
436///////////////////////// constructor /////////////////////////////
437
438libradosstriper::RadosStriperImpl::RadosStriperImpl(librados::IoCtx& ioctx, librados::IoCtxImpl *ioctx_impl) :
439 m_refCnt(0),lock("RadosStriper Refcont", false, false), m_radosCluster(ioctx), m_ioCtx(ioctx), m_ioCtxImpl(ioctx_impl),
440 m_layout(default_file_layout) {}
441
442///////////////////////// layout /////////////////////////////
443
444int libradosstriper::RadosStriperImpl::setObjectLayoutStripeUnit
445(unsigned int stripe_unit)
446{
447 /* stripe unit must be non-zero, 64k increment */
448 if (!stripe_unit || (stripe_unit & (CEPH_MIN_STRIPE_UNIT-1)))
449 return -EINVAL;
450 m_layout.fl_stripe_unit = stripe_unit;
451 return 0;
452}
453
454int libradosstriper::RadosStriperImpl::setObjectLayoutStripeCount
455(unsigned int stripe_count)
456{
457 /* stripe count must be non-zero */
458 if (!stripe_count)
459 return -EINVAL;
460 m_layout.fl_stripe_count = stripe_count;
461 return 0;
462}
463
464int libradosstriper::RadosStriperImpl::setObjectLayoutObjectSize
465(unsigned int object_size)
466{
467 /* object size must be non-zero, 64k increment */
468 if (!object_size || (object_size & (CEPH_MIN_STRIPE_UNIT-1)))
469 return -EINVAL;
470 /* object size must be a multiple of stripe unit */
471 if (object_size < m_layout.fl_stripe_unit ||
472 object_size % m_layout.fl_stripe_unit)
473 return -EINVAL;
474 m_layout.fl_object_size = object_size;
475 return 0;
476}
477
478///////////////////////// xattrs /////////////////////////////
479
480int libradosstriper::RadosStriperImpl::getxattr(const object_t& soid,
481 const char *name,
482 bufferlist& bl)
483{
484 std::string firstObjOid = getObjectId(soid, 0);
485 return m_ioCtx.getxattr(firstObjOid, name, bl);
486}
487
488int libradosstriper::RadosStriperImpl::setxattr(const object_t& soid,
489 const char *name,
490 bufferlist& bl)
491{
492 std::string firstObjOid = getObjectId(soid, 0);
493 return m_ioCtx.setxattr(firstObjOid, name, bl);
494}
495
496int libradosstriper::RadosStriperImpl::getxattrs(const object_t& soid,
497 map<string, bufferlist>& attrset)
498{
499 std::string firstObjOid = getObjectId(soid, 0);
500 int rc = m_ioCtx.getxattrs(firstObjOid, attrset);
501 if (rc) return rc;
502 // cleanup internal attributes dedicated to striping and locking
503 attrset.erase(XATTR_LAYOUT_STRIPE_UNIT);
504 attrset.erase(XATTR_LAYOUT_STRIPE_COUNT);
505 attrset.erase(XATTR_LAYOUT_OBJECT_SIZE);
506 attrset.erase(XATTR_SIZE);
507 attrset.erase(std::string(LOCK_PREFIX) + RADOS_LOCK_NAME);
508 return rc;
509}
510
511int libradosstriper::RadosStriperImpl::rmxattr(const object_t& soid,
512 const char *name)
513{
514 std::string firstObjOid = getObjectId(soid, 0);
515 return m_ioCtx.rmxattr(firstObjOid, name);
516}
517
518///////////////////////// io /////////////////////////////
519
520int libradosstriper::RadosStriperImpl::write(const std::string& soid,
521 const bufferlist& bl,
522 size_t len,
523 uint64_t off)
524{
525 // open the object. This will create it if needed, retrieve its layout
526 // and size and take a shared lock on it
527 ceph_file_layout layout;
528 std::string lockCookie;
529 int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true);
530 if (rc) return rc;
531 return write_in_open_object(soid, layout, lockCookie, bl, len, off);
532}
533
534int libradosstriper::RadosStriperImpl::append(const std::string& soid,
535 const bufferlist& bl,
536 size_t len)
537{
538 // open the object. This will create it if needed, retrieve its layout
539 // and size and take a shared lock on it
540 ceph_file_layout layout;
541 uint64_t size = len;
542 std::string lockCookie;
543 int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false);
544 if (rc) return rc;
545 return write_in_open_object(soid, layout, lockCookie, bl, len, size);
546}
547
548int libradosstriper::RadosStriperImpl::write_full(const std::string& soid,
549 const bufferlist& bl)
550{
551 int rc = trunc(soid, 0);
552 if (rc && rc != -ENOENT) return rc; // ENOENT is obviously ok
553 return write(soid, bl, bl.length(), 0);
554}
555
556int libradosstriper::RadosStriperImpl::read(const std::string& soid,
557 bufferlist* bl,
558 size_t len,
559 uint64_t off)
560{
561 // create a completion object
562 librados::AioCompletionImpl c;
563 // call asynchronous method
564 int rc = aio_read(soid, &c, bl, len, off);
565 // and wait for completion
566 if (!rc) {
567 // wait for completion
568 c.wait_for_complete_and_cb();
569 // return result
570 rc = c.get_return_value();
571 }
572 return rc;
573}
574
575///////////////////////// asynchronous io /////////////////////////////
576
577int libradosstriper::RadosStriperImpl::aio_write(const std::string& soid,
578 librados::AioCompletionImpl *c,
579 const bufferlist& bl,
580 size_t len,
581 uint64_t off)
582{
583 ceph_file_layout layout;
584 std::string lockCookie;
585 int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true);
586 if (rc) return rc;
587 return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, off);
588}
589
590int libradosstriper::RadosStriperImpl::aio_append(const std::string& soid,
591 librados::AioCompletionImpl *c,
592 const bufferlist& bl,
593 size_t len)
594{
595 ceph_file_layout layout;
596 uint64_t size = len;
597 std::string lockCookie;
598 int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false);
599 if (rc) return rc;
600 // create a completion object
601 return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, size);
602}
603
604int libradosstriper::RadosStriperImpl::aio_write_full(const std::string& soid,
605 librados::AioCompletionImpl *c,
606 const bufferlist& bl)
607{
608 int rc = trunc(soid, 0);
609 if (rc) return rc;
610 return aio_write(soid, c, bl, bl.length(), 0);
611}
612
613static void rados_read_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg)
614{
224ce89b 615 auto cdata = reinterpret_cast<ReadCompletionData*>(arg);
7c673cae
FG
616 libradosstriper::MultiAioCompletionImpl *comp =
617 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
618 cdata->complete_unlock(comp->rval);
619 cdata->put();
620}
621
622static void striper_read_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
623{
224ce89b 624 auto cdata = reinterpret_cast<ReadCompletionData*>(arg);
7c673cae
FG
625 // launch the async unlocking of the object
626 cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion);
627 // complete the read part in parallel
628 libradosstriper::MultiAioCompletionImpl *comp =
629 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
630 cdata->complete_read(comp->rval);
631}
632
633static void rados_req_read_safe(rados_completion_t c, void *arg)
634{
224ce89b 635 auto data = reinterpret_cast<RadosReadCompletionData*>(arg);
7c673cae
FG
636 int rc = rados_aio_get_return_value(c);
637 // ENOENT means that we are dealing with a sparse file. This is fine,
638 // data (0s) will be created on the fly by the rados_req_read_complete method
639 if (rc == -ENOENT) rc = 0;
224ce89b 640 auto multiAioComp = data->m_multiAioCompl;
7c673cae
FG
641 multiAioComp->safe_request(rc);
642 data->put();
643}
644
645static void rados_req_read_complete(rados_completion_t c, void *arg)
646{
224ce89b 647 auto data = reinterpret_cast<RadosReadCompletionData*>(arg);
7c673cae
FG
648 int rc = rados_aio_get_return_value(c);
649 // We need to handle the case of sparse files here
650 if (rc == -ENOENT) {
651 // the object did not exist at all. This can happen for sparse files.
652 // we consider we've read 0 bytes and it will fall into next case
653 rc = 0;
654 }
655 if (rc >= 0 && (((uint64_t)rc) < data->m_expectedBytes)) {
656 // only partial data were present in the object (or the object did not
657 // even exist if we've gone through previous case).
658 // This is typical of sparse file and we need to complete with 0s.
659 unsigned int lenOfZeros = data->m_expectedBytes-rc;
660 unsigned int existingDataToZero = min(data->m_bl->length()-rc, lenOfZeros);
661 if (existingDataToZero > 0) {
662 data->m_bl->zero(rc, existingDataToZero);
663 }
664 if (lenOfZeros > existingDataToZero) {
665 ceph::bufferptr zeros(ceph::buffer::create(lenOfZeros-existingDataToZero));
666 zeros.zero();
667 data->m_bl->push_back(zeros);
668 }
669 rc = data->m_expectedBytes;
670 }
224ce89b 671 auto multiAioComp = data->m_multiAioCompl;
7c673cae
FG
672 multiAioComp->complete_request(rc);
673 data->put();
674}
675
/**
 * Asynchronously reads up to len bytes at offset off of a striped object
 * into *bl.
 *
 * Steps, in order:
 *  - the object is opened for read, which gives its layout, its size and
 *    a lock cookie;
 *  - the readable length is clamped to what lies between off and the size
 *    of the object (0 when off is at or beyond the size);
 *  - the range is split into per-object extents via Striper::file_to_extents;
 *  - a ReadCompletionData is created; its destructor deletes the extents
 *    and resultbl vectors allocated here;
 *  - one rados aio_read is issued per extent, all gathered in a
 *    MultiAioCompletionImpl whose completion callback is
 *    striper_read_aio_req_complete.
 *
 * @param soid name of the striped object
 * @param c    user completion; flagged as a read and bound to m_ioCtxImpl
 * @param bl   destination of the data
 * @param len  maximum number of bytes to read
 * @param off  offset of the first byte to read
 * @return the error of the open step if it failed; otherwise the return
 *         code of the last rados aio_read issued (0 if none was issued)
 */
676int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid,
677 librados::AioCompletionImpl *c,
678 bufferlist* bl,
679 size_t len,
680 uint64_t off)
681{
682 // open the object. This will retrieve its layout and size
683 // and take a shared lock on it
684 ceph_file_layout layout;
685 uint64_t size;
686 std::string lockCookie;
687 int rc = openStripedObjectForRead(soid, &layout, &size, &lockCookie);
688 if (rc) return rc;
689 // find out the actual number of bytes we can read
690 uint64_t read_len;
691 if (off >= size) {
692 // nothing to read ! We are done.
693 read_len = 0;
694 } else {
695 read_len = min(len, (size_t)(size-off));
696 }
697 // get list of extents to be read from
// (left empty when there is nothing to read)
698 vector<ObjectExtent> *extents = new vector<ObjectExtent>();
699 if (read_len > 0) {
700 std::string format = soid + RADOS_OBJECT_EXTENSION_FORMAT;
701 file_layout_t l;
702 l.from_legacy(layout);
703 Striper::file_to_extents(cct(), format.c_str(), &l, off, read_len,
704 0, *extents);
705 }
706
707 // create a completion object and transfer ownership of extents and resultbl
708 vector<bufferlist> *resultbl = new vector<bufferlist>(extents->size());
709 ReadCompletionData *cdata = new ReadCompletionData(this, soid, lockCookie, c,
710 bl, extents, resultbl, 1);
711 c->is_read = true;
712 c->io = m_ioCtxImpl;
713 // create a completion for the unlocking of the striped object at the end of the read
714 librados::AioCompletion *unlock_completion =
715 librados::Rados::aio_create_completion(cdata, rados_read_aio_unlock_complete, 0);
716 cdata->m_unlockCompletion = unlock_completion;
717 // create the multiCompletion object handling the reads
224ce89b
WB
718 MultiAioCompletionImplPtr nc{new libradosstriper::MultiAioCompletionImpl,
719 false};
7c673cae
FG
720 nc->set_complete_callback(cdata, striper_read_aio_req_complete);
721 // go through the extents
// i indexes resultbl: there is one result bufferlist per extent
722 int r = 0, i = 0;
723 for (vector<ObjectExtent>::iterator p = extents->begin(); p != extents->end(); ++p) {
724 // create a buffer list describing where to place data read from current extend
725 bufferlist *oid_bl = &((*resultbl)[i++]);
726 for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
727 q != p->buffer_extents.end();
728 ++q) {
729 bufferlist buffer_bl;
730 buffer_bl.substr_of(*bl, q->first, q->second);
731 oid_bl->append(buffer_bl);
732 }
733 // read all extends of a given object in one go
734 nc->add_request();
735 // we need 2 references on data as both rados_req_read_safe and rados_req_read_complete
736 // will release one
737 RadosReadCompletionData *data = new RadosReadCompletionData(nc, p->length, oid_bl, cct(), 2);
738 librados::AioCompletion *rados_completion =
739 librados::Rados::aio_create_completion(data, rados_req_read_complete, rados_req_read_safe);
740 r = m_ioCtx.aio_read(p->oid.name, rados_completion, oid_bl, p->length, p->offset);
741 rados_completion->release();
// NOTE(review): when the read could not be issued, `data` is not released
// in this function and the request added to nc just above is not completed
// here either - confirm whether the callbacks still run in that case,
// otherwise the context and the lock on the object look like they are leaked
742 if (r < 0)
743 break;
744 }
745 nc->finish_adding_requests();
7c673cae
FG
746 return r;
747}
748
749int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid,
750 librados::AioCompletionImpl *c,
751 char* buf,
752 size_t len,
753 uint64_t off)
754{
755 // create a buffer list and store it inside the completion object
756 c->bl.clear();
757 c->bl.push_back(buffer::create_static(len, buf));
758 // call the bufferlist version of this method
759 return aio_read(soid, c, &c->bl, len, off);
760}
761
762int libradosstriper::RadosStriperImpl::aio_flush()
763{
764 int ret;
765 // pass to the rados level
766 ret = m_ioCtx.aio_flush();
767 if (ret < 0)
768 return ret;
769 //wait all CompletionData are released
770 lock.Lock();
771 while (m_refCnt > 1)
772 cond.Wait(lock);
773 lock.Unlock();
774 return ret;
775}
776
777///////////////////////// stat and deletion /////////////////////////////
778
779int libradosstriper::RadosStriperImpl::stat(const std::string& soid, uint64_t *psize, time_t *pmtime)
780{
781 // create a completion object
782 librados::AioCompletionImpl c;
783 // call asynchronous version of stat
784 int rc = aio_stat(soid, &c, psize, pmtime);
785 if (rc == 0) {
786 // wait for completion of the remove
787 c.wait_for_complete();
788 // get result
789 rc = c.get_return_value();
790 }
791 return rc;
792}
793
794static void striper_stat_aio_stat_complete(rados_completion_t c, void *arg) {
224ce89b 795 auto data = reinterpret_cast<BasicStatCompletionData*>(arg);
7c673cae
FG
796 int rc = rados_aio_get_return_value(c);
797 if (rc == -ENOENT) {
798 // remember this has failed
799 data->m_statRC = rc;
800 }
801 data->m_multiCompletion->complete_request(rc);
802 data->put();
803}
804
805static void striper_stat_aio_getxattr_complete(rados_completion_t c, void *arg) {
224ce89b 806 auto data = reinterpret_cast<BasicStatCompletionData*>(arg);
7c673cae
FG
807 int rc = rados_aio_get_return_value(c);
808 // We need to handle the case of sparse files here
809 if (rc < 0) {
810 // remember this has failed
811 data->m_getxattrRC = rc;
812 } else {
813 // this intermediate string allows to add a null terminator before calling strtol
814 std::string err;
815 std::string strsize(data->m_bl.c_str(), data->m_bl.length());
816 *data->m_psize = strict_strtoll(strsize.c_str(), 10, &err);
817 if (!err.empty()) {
818 lderr(data->m_striper->cct()) << XATTR_SIZE << " : " << err << dendl;
819 data->m_getxattrRC = -EINVAL;
820 }
821 rc = 0;
822 }
823 data->m_multiCompletion->complete_request(rc);
824 data->put();
825}
826
827static void striper_stat_aio_req_complete(rados_striper_multi_completion_t c,
828 void *arg) {
224ce89b 829 auto data = reinterpret_cast<BasicStatCompletionData*>(arg);
7c673cae
FG
830 if (data->m_statRC) {
831 data->complete(data->m_statRC);
832 } else {
833 if (data->m_getxattrRC < 0) {
834 data->complete(data->m_getxattrRC);
835 } else {
836 data->complete(0);
837 }
838 }
839 data->put();
840}
841
/**
 * Common implementation of the asynchronous stat calls.
 *
 * Two asynchronous calls are issued on the first rados object of the
 * striped object, gathered in a MultiAioCompletionImpl:
 *  - a stat, done through statFunction, giving the time information;
 *  - a getxattr of XATTR_SIZE, giving the size of the striped object.
 * The context object is created with 4 references: one is dropped by each
 * of the three callbacks (striper_stat_aio_stat_complete,
 * striper_stat_aio_getxattr_complete, striper_stat_aio_req_complete) and
 * one at the end of this function.
 *
 * @param soid         name of the striped object
 * @param c            user completion
 * @param psize        receives the size of the striped object
 * @param pmtime       receives the time information
 * @param statFunction member function of IoCtxImpl doing the async stat
 * @return 0 if both calls were started, the error of the failing one
 *         otherwise
 */
842template<class TimeType>
843int libradosstriper::RadosStriperImpl::aio_generic_stat
844(const std::string& soid,
845 librados::AioCompletionImpl *c,
846 uint64_t *psize,
847 TimeType *pmtime,
848 typename libradosstriper::RadosStriperImpl::StatFunction<TimeType>::Type statFunction)
849{
850 // use a MultiAioCompletion object for dealing with the fact
851 // that we'll do 2 asynchronous calls in parallel
224ce89b
WB
852 MultiAioCompletionImplPtr multi_completion{
853 new libradosstriper::MultiAioCompletionImpl, false};
7c673cae
FG
854 // Data object used for passing context to asynchronous calls
855 std::string firstObjOid = getObjectId(soid, 0);
856 StatCompletionData<TimeType> *cdata =
857 new StatCompletionData<TimeType>(this, firstObjOid, c,
224ce89b 858 multi_completion.get(), psize, pmtime, 4);
7c673cae
FG
859 multi_completion->set_complete_callback(cdata, striper_stat_aio_req_complete);
860 // use a regular AioCompletion for the stat async call
861 librados::AioCompletion *stat_completion =
862 librados::Rados::aio_create_completion(cdata, striper_stat_aio_stat_complete, 0);
863 multi_completion->add_safe_request();
864 object_t obj(firstObjOid);
// the object size goes to a scratch member of cdata and is not used
865 int rc = (m_ioCtxImpl->*statFunction)(obj, stat_completion->pc,
866 &cdata->m_objectSize, cdata->m_pmtime);
867 stat_completion->release();
868 if (rc < 0) {
869 // nothing is really started so cancel everything
// NOTE(review): cdata was created with 4 references and is deleted
// directly here - confirm RefCountedObject tolerates destruction with a
// non-zero count, and that multi_completion (which still has cdata as
// callback argument) never fires once it goes out of scope
7c673cae
FG
870 delete cdata;
871 return rc;
872 }
873 // use a regular AioCompletion for the getxattr async call
874 librados::AioCompletion *getxattr_completion =
875 librados::Rados::aio_create_completion(cdata, striper_stat_aio_getxattr_complete, 0);
876 multi_completion->add_safe_request();
877 // in parallel, get the pmsize from the first object asynchronously
878 rc = m_ioCtxImpl->aio_getxattr(obj, getxattr_completion->pc,
879 XATTR_SIZE, cdata->m_bl);
880 getxattr_completion->release();
881 multi_completion->finish_adding_requests();
882 if (rc < 0) {
883 // the async stat is ongoing, so we need to go on
884 // we mark the getxattr as failed in the data object
// NOTE(review): this path returns without the cdata->put() done on the
// success path, and the getxattr callback presumably never runs - confirm
// that the references on cdata are still all released
885 cdata->m_getxattrRC = rc;
886 multi_completion->complete_request(rc);
7c673cae
FG
887 return rc;
888 }
// drop the reference kept for the duration of this function
889 cdata->put();
7c673cae
FG
890 return 0;
891}
892
893int libradosstriper::RadosStriperImpl::aio_stat(const std::string& soid,
894 librados::AioCompletionImpl *c,
895 uint64_t *psize,
896 time_t *pmtime)
897{
898 return aio_generic_stat<time_t>(soid, c, psize, pmtime, &librados::IoCtxImpl::aio_stat);
899}
900
901int libradosstriper::RadosStriperImpl::stat2(const std::string& soid, uint64_t *psize, struct timespec *pts)
902{
903 // create a completion object
904 librados::AioCompletionImpl c;
905 // call asynchronous version of stat
906 int rc = aio_stat2(soid, &c, psize, pts);
907 if (rc == 0) {
908 // wait for completion of the remove
909 c.wait_for_complete_and_cb();
910 // get result
911 rc = c.get_return_value();
912 }
913 return rc;
914}
915
916int libradosstriper::RadosStriperImpl::aio_stat2(const std::string& soid,
917 librados::AioCompletionImpl *c,
918 uint64_t *psize,
919 struct timespec *pts)
920{
921 return aio_generic_stat<struct timespec>(soid, c, psize, pts, &librados::IoCtxImpl::aio_stat2);
922}
923
924static void rados_req_remove_complete(rados_completion_t c, void *arg)
925{
224ce89b 926 auto cdata = reinterpret_cast<RadosRemoveCompletionData*>(arg);
7c673cae
FG
927 int rc = rados_aio_get_return_value(c);
928 // in case the object did not exist, it means we had a sparse file, all is fine
929 if (rc == -ENOENT) {
930 rc = 0;
931 }
932 cdata->m_multiAioCompl->complete_request(rc);
933 cdata->put();
934}
935
936static void rados_req_remove_safe(rados_completion_t c, void *arg)
937{
224ce89b 938 auto cdata = reinterpret_cast<RadosRemoveCompletionData*>(arg);
7c673cae
FG
939 int rc = rados_aio_get_return_value(c);
940 // in case the object did not exist, it means we had a sparse file, all is fine
941 if (rc == -ENOENT) {
942 rc = 0;
943 }
944 cdata->m_multiAioCompl->safe_request(rc);
945 cdata->put();
946}
947
948static void striper_remove_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
949{
224ce89b 950 auto cdata = reinterpret_cast<RemoveCompletionData*>(arg);
7c673cae
FG
951 libradosstriper::MultiAioCompletionImpl *comp =
952 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
953 ldout(cdata->m_striper->cct(), 10)
954 << "RadosStriperImpl : striper_remove_aio_req_complete called for "
955 << cdata->m_soid << dendl;
956 int rc = comp->rval;
957 if (rc == 0) {
958 // All went fine, synchronously remove first object
959 rc = cdata->m_striper->m_ioCtx.remove(cdata->m_striper->getObjectId(cdata->m_soid, 0),
960 cdata->flags);
961 } else {
962 lderr(cdata->m_striper->cct())
963 << "RadosStriperImpl : deletion/truncation incomplete for " << cdata->m_soid
964 << ", as errors were encountered. The file is left present but it's content "
965 << " has been partially removed"
966 << dendl;
967 }
968 cdata->complete(rc);
969 cdata->put();
970}
971
972int libradosstriper::RadosStriperImpl::remove(const std::string& soid, int flags)
973{
974 // create a completion object
975 librados::AioCompletionImpl c;
976 // call asynchronous version of remove
977 int rc = aio_remove(soid, &c, flags);
978 if (rc == 0) {
979 // wait for completion of the remove
980 c.wait_for_complete_and_cb();
981 // get result
982 rc = c.get_return_value();
983 }
984 return rc;
985}
986
987int libradosstriper::RadosStriperImpl::aio_remove(const std::string& soid,
988 librados::AioCompletionImpl *c,
989 int flags)
990{
991 // the RemoveCompletionData object will lock the given soid for the duration
992 // of the removal
993 std::string lockCookie = getUUID();
994 int rc = m_ioCtx.lock_exclusive(getObjectId(soid, 0), RADOS_LOCK_NAME, lockCookie, "", 0, 0);
995 if (rc) return rc;
996 // create CompletionData for the async remove call
997 RemoveCompletionData *cdata = new RemoveCompletionData(this, soid, lockCookie, c, flags);
224ce89b
WB
998 MultiAioCompletionImplPtr multi_completion{
999 new libradosstriper::MultiAioCompletionImpl, false};
7c673cae
FG
1000 multi_completion->set_complete_callback(cdata, striper_remove_aio_req_complete);
1001 // call asynchronous internal version of remove
1002 ldout(cct(), 10)
1003 << "RadosStriperImpl : Aio_remove starting for "
1004 << soid << dendl;
1005 rc = internal_aio_remove(soid, multi_completion);
7c673cae
FG
1006 return rc;
1007}
1008
224ce89b
WB
1009int libradosstriper::RadosStriperImpl::internal_aio_remove(
1010 const std::string& soid,
1011 MultiAioCompletionImplPtr multi_completion,
7c673cae
FG
1012 int flags)
1013{
1014 std::string firstObjOid = getObjectId(soid, 0);
1015 try {
1016 // check size and get number of rados objects to delete
1017 uint64_t nb_objects = 0;
1018 bufferlist bl2;
1019 int rc = getxattr(soid, XATTR_SIZE, bl2);
1020 if (rc < 0) {
1021 // no object size (or not able to get it)
1022 // try to find the number of object "by hand"
1023 uint64_t psize;
1024 time_t pmtime;
1025 while (!m_ioCtx.stat(getObjectId(soid, nb_objects), &psize, &pmtime)) {
1026 nb_objects++;
1027 }
1028 } else {
1029 // count total number of rados objects in the striped object
1030 std::string err;
1031 // this intermediate string allows to add a null terminator before calling strtol
1032 std::string strsize(bl2.c_str(), bl2.length());
1033 uint64_t size = strict_strtoll(strsize.c_str(), 10, &err);
1034 if (!err.empty()) {
1035 lderr(cct()) << XATTR_SIZE << " : " << err << dendl;
1036
1037 return -EINVAL;
1038 }
1039 uint64_t object_size = m_layout.fl_object_size;
1040 uint64_t su = m_layout.fl_stripe_unit;
1041 uint64_t stripe_count = m_layout.fl_stripe_count;
1042 uint64_t nb_complete_sets = size / (object_size*stripe_count);
1043 uint64_t remaining_data = size % (object_size*stripe_count);
1044 uint64_t remaining_stripe_units = (remaining_data + su -1) / su;
1045 uint64_t remaining_objects = std::min(remaining_stripe_units, stripe_count);
1046 nb_objects = nb_complete_sets * stripe_count + remaining_objects;
1047 }
1048 // delete rados objects in reverse order
1049 // Note that we do not drop the first object. This one will only be dropped
1050 // if all other removals have been successful, and this is done in the
1051 // callback of the multi_completion object
1052 int rcr = 0;
1053 for (int i = nb_objects-1; i >= 1; i--) {
1054 multi_completion->add_request();
1055 RadosRemoveCompletionData *data =
1056 new RadosRemoveCompletionData(multi_completion, cct());
1057 librados::AioCompletion *rados_completion =
1058 librados::Rados::aio_create_completion(data,
1059 rados_req_remove_complete,
1060 rados_req_remove_safe);
1061 if (flags == 0) {
1062 rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion);
1063 } else {
1064 rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion, flags);
1065 }
1066 rados_completion->release();
1067 if (rcr < 0 and -ENOENT != rcr) {
1068 lderr(cct()) << "RadosStriperImpl::remove : deletion incomplete for " << soid
1069 << ", as " << getObjectId(soid, i) << " could not be deleted (rc=" << rc << ")"
1070 << dendl;
1071 break;
1072 }
1073 }
1074 // we are over adding requests to the multi_completion object
1075 multi_completion->finish_adding_requests();
1076 // return
1077 return rcr;
1078 } catch (ErrorCode &e) {
1079 // errror caught when trying to take the exclusive lock
1080 return e.m_code;
1081 }
1082
1083}
1084
1085int libradosstriper::RadosStriperImpl::trunc(const std::string& soid, uint64_t size)
1086{
1087 // lock the object in exclusive mode
1088 std::string firstObjOid = getObjectId(soid, 0);
1089 librados::ObjectWriteOperation op;
1090 op.assert_exists();
1091 std::string lockCookie = RadosStriperImpl::getUUID();
1092 utime_t dur = utime_t();
1093 rados::cls::lock::lock(&op, RADOS_LOCK_NAME, LOCK_EXCLUSIVE, lockCookie, "", "", dur, 0);
1094 int rc = m_ioCtx.operate(firstObjOid, &op);
1095 if (rc) return rc;
1096 // load layout and size
1097 ceph_file_layout layout;
1098 uint64_t original_size;
1099 rc = internal_get_layout_and_size(firstObjOid, &layout, &original_size);
1100 if (!rc) {
1101 if (size < original_size) {
1102 rc = truncate(soid, original_size, size, layout);
1103 } else if (size > original_size) {
1104 rc = grow(soid, original_size, size, layout);
1105 }
1106 }
1107 // unlock object, ignore return code as we cannot do much
1108 m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie);
1109 // final return
1110 return rc;
1111}
1112
1113
1114///////////////////////// private helpers /////////////////////////////
1115
1116std::string libradosstriper::RadosStriperImpl::getObjectId(const object_t& soid,
1117 long long unsigned objectno)
1118{
1119 std::ostringstream s;
1120 s << soid << '.' << std::setfill ('0') << std::setw(16) << std::hex << objectno;
1121 return s.str();
1122}
1123
1124void libradosstriper::RadosStriperImpl::unlockObject(const std::string& soid,
1125 const std::string& lockCookie)
1126{
1127 // unlock the shared lock on the first rados object
1128 std::string firstObjOid = getObjectId(soid, 0);
1129 m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie);
1130}
1131
1132void libradosstriper::RadosStriperImpl::aio_unlockObject(const std::string& soid,
1133 const std::string& lockCookie,
1134 librados::AioCompletion *c)
1135{
1136 // unlock the shared lock on the first rados object
1137 std::string firstObjOid = getObjectId(soid, 0);
1138 m_ioCtx.aio_unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie, c);
1139}
1140
1141static void rados_write_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg)
1142{
224ce89b 1143 auto cdata = reinterpret_cast<WriteCompletionData*>(arg);
7c673cae
FG
1144 libradosstriper::MultiAioCompletionImpl *comp =
1145 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1146 cdata->complete_unlock(comp->rval);
1147 cdata->put();
1148}
1149
1150static void striper_write_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
1151{
224ce89b 1152 auto cdata = reinterpret_cast<WriteCompletionData*>(arg);
7c673cae
FG
1153 // launch the async unlocking of the object
1154 cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion);
1155 // complete the write part in parallel
1156 libradosstriper::MultiAioCompletionImpl *comp =
1157 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1158 cdata->complete_write(comp->rval);
1159 cdata->put();
1160}
1161
1162static void striper_write_aio_req_safe(rados_striper_multi_completion_t c, void *arg)
1163{
224ce89b 1164 auto cdata = reinterpret_cast<WriteCompletionData*>(arg);
7c673cae
FG
1165 libradosstriper::MultiAioCompletionImpl *comp =
1166 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1167 cdata->safe(comp->rval);
1168 cdata->put();
1169}
1170
1171int libradosstriper::RadosStriperImpl::write_in_open_object(const std::string& soid,
1172 const ceph_file_layout& layout,
1173 const std::string& lockCookie,
1174 const bufferlist& bl,
1175 size_t len,
1176 uint64_t off) {
1177 // create a completion object to be passed to the callbacks of the multicompletion
1178 // we need 3 references as striper_write_aio_req_complete will release two and
1179 // striper_write_aio_req_safe will release one
1180 WriteCompletionData *cdata = new WriteCompletionData(this, soid, lockCookie, 0, 3);
1181 cdata->get(); // local ref
1182 // create a completion object for the unlocking of the striped object at the end of the write
1183 librados::AioCompletion *unlock_completion =
1184 librados::Rados::aio_create_completion(cdata, rados_write_aio_unlock_complete, 0);
1185 cdata->m_unlockCompletion = unlock_completion;
1186 // create the multicompletion that will handle the write completion
224ce89b
WB
1187 MultiAioCompletionImplPtr c{new libradosstriper::MultiAioCompletionImpl,
1188 false};
7c673cae
FG
1189 c->set_complete_callback(cdata, striper_write_aio_req_complete);
1190 c->set_safe_callback(cdata, striper_write_aio_req_safe);
1191 // call the asynchronous API
1192 int rc = internal_aio_write(soid, c, bl, len, off, layout);
1193 if (!rc) {
1194 // wait for completion and safety of data
1195 c->wait_for_complete_and_cb();
1196 c->wait_for_safe_and_cb();
1197 // wait for the unlocking
1198 unlock_completion->wait_for_complete();
1199 // return result
1200 rc = c->get_return_value();
1201 }
7c673cae
FG
1202 cdata->put();
1203 return rc;
1204}
1205
1206int libradosstriper::RadosStriperImpl::aio_write_in_open_object(const std::string& soid,
1207 librados::AioCompletionImpl *c,
1208 const ceph_file_layout& layout,
1209 const std::string& lockCookie,
1210 const bufferlist& bl,
1211 size_t len,
1212 uint64_t off) {
1213 // create a completion object to be passed to the callbacks of the multicompletion
1214 // we need 3 references as striper_write_aio_req_complete will release two and
1215 // striper_write_aio_req_safe will release one
1216 WriteCompletionData *cdata = new WriteCompletionData(this, soid, lockCookie, c, 3);
1217 cdata->get(); // local ref
1218 m_ioCtxImpl->get();
1219 c->io = m_ioCtxImpl;
1220 // create a completion object for the unlocking of the striped object at the end of the write
1221 librados::AioCompletion *unlock_completion =
1222 librados::Rados::aio_create_completion(cdata, rados_write_aio_unlock_complete, 0);
1223 cdata->m_unlockCompletion = unlock_completion;
1224 // create the multicompletion that will handle the write completion
224ce89b
WB
1225 libradosstriper::MultiAioCompletionImplPtr nc{
1226 new libradosstriper::MultiAioCompletionImpl, false};
7c673cae
FG
1227 nc->set_complete_callback(cdata, striper_write_aio_req_complete);
1228 nc->set_safe_callback(cdata, striper_write_aio_req_safe);
1229 // internal asynchronous API
1230 int rc = internal_aio_write(soid, nc, bl, len, off, layout);
7c673cae
FG
1231 cdata->put();
1232 return rc;
1233}
1234
1235static void rados_req_write_safe(rados_completion_t c, void *arg)
1236{
1237 libradosstriper::MultiAioCompletionImpl *comp =
1238 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(arg);
1239 comp->safe_request(rados_aio_get_return_value(c));
1240}
1241
1242static void rados_req_write_complete(rados_completion_t c, void *arg)
1243{
1244 libradosstriper::MultiAioCompletionImpl *comp =
1245 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(arg);
1246 comp->complete_request(rados_aio_get_return_value(c));
1247}
1248
1249int
1250libradosstriper::RadosStriperImpl::internal_aio_write(const std::string& soid,
224ce89b 1251 libradosstriper::MultiAioCompletionImplPtr c,
7c673cae
FG
1252 const bufferlist& bl,
1253 size_t len,
1254 uint64_t off,
1255 const ceph_file_layout& layout)
1256{
1257 int r = 0;
1258 // Do not try anything if we are called with empty buffer,
1259 // file_to_extents would raise an exception
1260 if (len > 0) {
1261 // get list of extents to be written to
1262 vector<ObjectExtent> extents;
1263 std::string format = soid + RADOS_OBJECT_EXTENSION_FORMAT;
1264 file_layout_t l;
1265 l.from_legacy(layout);
1266 Striper::file_to_extents(cct(), format.c_str(), &l, off, len, 0, extents);
1267 // go through the extents
1268 for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
1269 // assemble pieces of a given object into a single buffer list
1270 bufferlist oid_bl;
1271 for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
1272 q != p->buffer_extents.end();
1273 ++q) {
1274 bufferlist buffer_bl;
1275 buffer_bl.substr_of(bl, q->first, q->second);
1276 oid_bl.append(buffer_bl);
1277 }
1278 // and write the object
1279 c->add_request();
1280 librados::AioCompletion *rados_completion =
224ce89b
WB
1281 librados::Rados::aio_create_completion(c.get(),
1282 rados_req_write_complete,
1283 rados_req_write_safe);
1284 r = m_ioCtx.aio_write(p->oid.name, rados_completion, oid_bl,
1285 p->length, p->offset);
7c673cae
FG
1286 rados_completion->release();
1287 if (r < 0)
1288 break;
1289 }
1290 }
1291 c->finish_adding_requests();
1292 return r;
1293}
1294
1295int libradosstriper::RadosStriperImpl::extract_uint32_attr
1296(std::map<std::string, bufferlist> &attrs,
1297 const std::string& key,
1298 ceph_le32 *value)
1299{
1300 std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key);
1301 if (attrsIt != attrs.end()) {
1302 // this intermediate string allows to add a null terminator before calling strtol
1303 std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length());
1304 std::string err;
1305 *value = strict_strtol(strvalue.c_str(), 10, &err);
1306 if (!err.empty()) {
1307 lderr(cct()) << key << " : " << err << dendl;
1308 return -EINVAL;
1309 }
1310 } else {
1311 return -ENOENT;
1312 }
1313 return 0;
1314}
1315
1316int libradosstriper::RadosStriperImpl::extract_sizet_attr
1317(std::map<std::string, bufferlist> &attrs,
1318 const std::string& key,
1319 size_t *value)
1320{
1321 std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key);
1322 if (attrsIt != attrs.end()) {
1323 // this intermediate string allows to add a null terminator before calling strtol
1324 std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length());
1325 std::string err;
1326 *value = strict_strtoll(strvalue.c_str(), 10, &err);
1327 if (!err.empty()) {
1328 lderr(cct()) << key << " : " << err << dendl;
1329 return -EINVAL;
1330 }
1331 } else {
1332 return -ENOENT;
1333 }
1334 return 0;
1335}
1336
1337int libradosstriper::RadosStriperImpl::internal_get_layout_and_size(
1338 const std::string& oid,
1339 ceph_file_layout *layout,
1340 uint64_t *size)
1341{
1342 // get external attributes of the first rados object
1343 std::map<std::string, bufferlist> attrs;
1344 int rc = m_ioCtx.getxattrs(oid, attrs);
1345 if (rc) return rc;
1346 // deal with stripe_unit
1347 rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_UNIT, &layout->fl_stripe_unit);
1348 if (rc) return rc;
1349 // deal with stripe_count
1350 rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_COUNT, &layout->fl_stripe_count);
1351 if (rc) return rc;
1352 // deal with object_size
1353 rc = extract_uint32_attr(attrs, XATTR_LAYOUT_OBJECT_SIZE, &layout->fl_object_size);
1354 if (rc) return rc;
1355 // deal with size
1356 size_t ssize;
1357 rc = extract_sizet_attr(attrs, XATTR_SIZE, &ssize);
1358 if (rc) {
1359 return rc;
1360 }
1361 *size = ssize;
1362 // make valgrind happy by setting unused fl_pg_pool
1363 layout->fl_pg_pool = 0;
1364 return 0;
1365}
1366
1367int libradosstriper::RadosStriperImpl::openStripedObjectForRead(
1368 const std::string& soid,
1369 ceph_file_layout *layout,
1370 uint64_t *size,
1371 std::string *lockCookie)
1372{
1373 // take a lock the first rados object, if it exists and gets its size
1374 // check, lock and size reading must be atomic and are thus done within a single operation
1375 librados::ObjectWriteOperation op;
1376 op.assert_exists();
1377 *lockCookie = getUUID();
1378 utime_t dur = utime_t();
1379 rados::cls::lock::lock(&op, RADOS_LOCK_NAME, LOCK_SHARED, *lockCookie, "Tag", "", dur, 0);
1380 std::string firstObjOid = getObjectId(soid, 0);
1381 int rc = m_ioCtx.operate(firstObjOid, &op);
1382 if (rc) {
1383 // error case (including -ENOENT)
1384 return rc;
1385 }
1386 rc = internal_get_layout_and_size(firstObjOid, layout, size);
1387 if (rc) {
1388 unlockObject(soid, *lockCookie);
1389 lderr(cct()) << "RadosStriperImpl::openStripedObjectForRead : "
1390 << "could not load layout and size for "
1391 << soid << " : rc = " << rc << dendl;
1392 }
1393 return rc;
1394}
1395
1396int libradosstriper::RadosStriperImpl::openStripedObjectForWrite(const std::string& soid,
1397 ceph_file_layout *layout,
1398 uint64_t *size,
1399 std::string *lockCookie,
1400 bool isFileSizeAbsolute)
1401{
1402 // take a lock the first rados object, if it exists
1403 // check and lock must be atomic and are thus done within a single operation
1404 librados::ObjectWriteOperation op;
1405 op.assert_exists();
1406 *lockCookie = getUUID();
1407 utime_t dur = utime_t();
1408 rados::cls::lock::lock(&op, RADOS_LOCK_NAME, LOCK_SHARED, *lockCookie, "Tag", "", dur, 0);
1409 std::string firstObjOid = getObjectId(soid, 0);
1410 int rc = m_ioCtx.operate(firstObjOid, &op);
1411 if (rc) {
1412 if (rc == -ENOENT) {
1413 // object does not exist, delegate to createEmptyStripedObject
1414 int rc = createAndOpenStripedObject(soid, layout, *size, lockCookie, isFileSizeAbsolute);
1415 // return original size
1416 *size = 0;
1417 return rc;
1418 } else {
1419 return rc;
1420 }
1421 }
1422 // all fine
1423 uint64_t curSize;
1424 rc = internal_get_layout_and_size(firstObjOid, layout, &curSize);
1425 if (rc) {
1426 unlockObject(soid, *lockCookie);
1427 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1428 << "could not load layout and size for "
1429 << soid << " : rc = " << rc << dendl;
1430 return rc;
1431 }
1432 // atomically update object size, only if smaller than current one
1433 if (!isFileSizeAbsolute)
1434 *size += curSize;
1435 librados::ObjectWriteOperation writeOp;
1436 writeOp.cmpxattr(XATTR_SIZE, LIBRADOS_CMPXATTR_OP_GT, *size);
1437 std::ostringstream oss;
1438 oss << *size;
1439 bufferlist bl;
1440 bl.append(oss.str());
1441 writeOp.setxattr(XATTR_SIZE, bl);
1442 rc = m_ioCtx.operate(firstObjOid, &writeOp);
1443 // return current size
1444 *size = curSize;
1445 // handle case where objectsize is already bigger than size
1446 if (-ECANCELED == rc)
1447 rc = 0;
1448 if (rc) {
1449 unlockObject(soid, *lockCookie);
1450 lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : "
1451 << "could not set new size for "
1452 << soid << " : rc = " << rc << dendl;
1453 }
1454 return rc;
1455}
1456
1457int libradosstriper::RadosStriperImpl::createAndOpenStripedObject(const std::string& soid,
1458 ceph_file_layout *layout,
1459 uint64_t size,
1460 std::string *lockCookie,
1461 bool isFileSizeAbsolute)
1462{
1463 // build atomic write operation
1464 librados::ObjectWriteOperation writeOp;
1465 writeOp.create(true);
1466 // object_size
1467 std::ostringstream oss_object_size;
1468 oss_object_size << m_layout.fl_object_size;
1469 bufferlist bl_object_size;
1470 bl_object_size.append(oss_object_size.str());
1471 writeOp.setxattr(XATTR_LAYOUT_OBJECT_SIZE, bl_object_size);
1472 // stripe unit
1473 std::ostringstream oss_stripe_unit;
1474 oss_stripe_unit << m_layout.fl_stripe_unit;
1475 bufferlist bl_stripe_unit;
1476 bl_stripe_unit.append(oss_stripe_unit.str());
1477 writeOp.setxattr(XATTR_LAYOUT_STRIPE_UNIT, bl_stripe_unit);
1478 // stripe count
1479 std::ostringstream oss_stripe_count;
1480 oss_stripe_count << m_layout.fl_stripe_count;
1481 bufferlist bl_stripe_count;
1482 bl_stripe_count.append(oss_stripe_count.str());
1483 writeOp.setxattr(XATTR_LAYOUT_STRIPE_COUNT, bl_stripe_count);
1484 // size
1485 std::ostringstream oss_size;
1486 oss_size << (isFileSizeAbsolute?size:0);
1487 bufferlist bl_size;
1488 bl_size.append(oss_size.str());
1489 writeOp.setxattr(XATTR_SIZE, bl_size);
1490 // effectively change attributes
1491 std::string firstObjOid = getObjectId(soid, 0);
1492 int rc = m_ioCtx.operate(firstObjOid, &writeOp);
1493 // in case of error (but no EEXIST which would mean the object existed), return
1494 if (rc && -EEXIST != rc) return rc;
1495 // Otherwise open the object
1496 uint64_t fileSize = size;
1497 return openStripedObjectForWrite(soid, layout, &fileSize, lockCookie, isFileSizeAbsolute);
1498}
1499
1500static void striper_truncate_aio_req_complete(rados_striper_multi_completion_t c, void *arg)
1501{
224ce89b 1502 auto cdata = reinterpret_cast<TruncateCompletionData*>(arg);
7c673cae
FG
1503 libradosstriper::MultiAioCompletionImpl *comp =
1504 reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c);
1505 if (0 == comp->rval) {
1506 // all went fine, change size in the external attributes
1507 std::ostringstream oss;
1508 oss << cdata->m_size;
1509 bufferlist bl;
1510 bl.append(oss.str());
1511 cdata->m_striper->setxattr(cdata->m_soid, XATTR_SIZE, bl);
1512 }
1513 cdata->put();
1514}
1515
1516int libradosstriper::RadosStriperImpl::truncate(const std::string& soid,
1517 uint64_t original_size,
1518 uint64_t size,
1519 ceph_file_layout &layout)
1520{
1521 TruncateCompletionData *cdata = new TruncateCompletionData(this, soid, size);
224ce89b
WB
1522 libradosstriper::MultiAioCompletionImplPtr multi_completion{
1523 new libradosstriper::MultiAioCompletionImpl, false};
7c673cae
FG
1524 multi_completion->set_complete_callback(cdata, striper_truncate_aio_req_complete);
1525 // call asynchrous version of truncate
1526 int rc = aio_truncate(soid, multi_completion, original_size, size, layout);
1527 // wait for completion of the truncation
1528 multi_completion->finish_adding_requests();
1529 multi_completion->wait_for_complete_and_cb();
1530 // return result
1531 if (rc == 0) {
1532 rc = multi_completion->get_return_value();
1533 }
7c673cae
FG
1534 return rc;
1535}
1536
1537int libradosstriper::RadosStriperImpl::aio_truncate
1538(const std::string& soid,
224ce89b 1539 libradosstriper::MultiAioCompletionImplPtr multi_completion,
7c673cae
FG
1540 uint64_t original_size,
1541 uint64_t size,
1542 ceph_file_layout &layout)
1543{
1544 // handle the underlying rados objects. 3 cases here :
1545 // -- the objects belonging to object sets entirely located
1546 // before the truncation are unchanged
1547 // -- the objects belonging to the object set where the
1548 // truncation took place are truncated or removed
1549 // -- the objects belonging to object sets entirely located
1550 // after the truncation are removed
1551 // Note that we do it backward and that we change the size in
1552 // the external attributes only at the end. This make sure that
1553 // no rados object stays behind if we remove the striped object
1554 // after a truncation has failed
1555 uint64_t trunc_objectsetno = size / layout.fl_object_size / layout.fl_stripe_count;
1556 uint64_t last_objectsetno = original_size / layout.fl_object_size / layout.fl_stripe_count;
1557 bool exists = false;
1558 for (int64_t objectno = (last_objectsetno+1) * layout.fl_stripe_count-1;
1559 objectno >= (int64_t)((trunc_objectsetno + 1) * layout.fl_stripe_count);
1560 objectno--) {
1561 // if no object existed so far, check object existence
1562 if (!exists) {
1563 uint64_t nb_full_object_set = objectno / layout.fl_stripe_count;
1564 uint64_t object_index_in_set = objectno % layout.fl_stripe_count;
1565 uint64_t set_start_off = nb_full_object_set * layout.fl_object_size * layout.fl_stripe_count;
1566 uint64_t object_start_off = set_start_off + object_index_in_set * layout.fl_stripe_unit;
1567 exists = (original_size > object_start_off);
1568 }
1569 if (exists) {
1570 // remove asynchronously
1571 multi_completion->add_request();
1572 RadosRemoveCompletionData *data =
1573 new RadosRemoveCompletionData(multi_completion, cct());
1574 librados::AioCompletion *rados_completion =
1575 librados::Rados::aio_create_completion(data,
1576 rados_req_remove_complete,
1577 rados_req_remove_safe);
1578 int rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion);
1579 rados_completion->release();
1580 // in case the object did not exist, it means we had a sparse file, all is fine
1581 if (rc && rc != -ENOENT) return rc;
1582 }
1583 }
1584 for (int64_t objectno = ((trunc_objectsetno + 1) * layout.fl_stripe_count) -1;
1585 objectno >= (int64_t)(trunc_objectsetno * layout.fl_stripe_count);
1586 objectno--) {
1587 // if no object existed so far, check object existence
1588 if (!exists) {
1589 uint64_t object_start_off = ((objectno / layout.fl_stripe_count) * layout.fl_object_size) +
1590 ((objectno % layout.fl_stripe_count) * layout.fl_stripe_unit);
1591 exists = (original_size > object_start_off);
1592 }
1593 if (exists) {
1594 // truncate
1595 file_layout_t l;
1596 l.from_legacy(layout);
1597 uint64_t new_object_size = Striper::object_truncate_size(cct(), &l, objectno, size);
1598 int rc;
1599 if (new_object_size > 0 or 0 == objectno) {
1600 // trunc is synchronous as there is no async version
1601 // but note that only a single object will be truncated
1602 // reducing the overload to a fixed amount
1603 rc = m_ioCtx.trunc(getObjectId(soid, objectno), new_object_size);
1604 } else {
1605 // removes are asynchronous in order to speed up truncations of big files
1606 multi_completion->add_request();
1607 RadosRemoveCompletionData *data =
1608 new RadosRemoveCompletionData(multi_completion, cct());
1609 librados::AioCompletion *rados_completion =
1610 librados::Rados::aio_create_completion(data,
1611 rados_req_remove_complete,
1612 rados_req_remove_safe);
1613 rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion);
1614 rados_completion->release();
1615 }
1616 // in case the object did not exist, it means we had a sparse file, all is fine
1617 if (rc && rc != -ENOENT) return rc;
1618 }
1619 }
1620 return 0;
1621}
1622
1623int libradosstriper::RadosStriperImpl::grow(const std::string& soid,
1624 uint64_t original_size,
1625 uint64_t size,
1626 ceph_file_layout &layout)
1627{
1628 // handle the underlying rados objects. As we support sparse objects,
1629 // we only have to change the size in the external attributes
1630 std::ostringstream oss;
1631 oss << size;
1632 bufferlist bl;
1633 bl.append(oss.str());
1634 int rc = m_ioCtx.setxattr(getObjectId(soid, 0), XATTR_SIZE, bl);
1635 return rc;
1636}
1637
1638std::string libradosstriper::RadosStriperImpl::getUUID()
1639{
1640 struct uuid_d uuid;
1641 uuid.generate_random();
1642 char suuid[37];
1643 uuid.print(suuid);
1644 return std::string(suuid);
1645}