]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
c07f9fc5 FG |
15 | #include <boost/algorithm/string/replace.hpp> |
16 | ||
7c673cae FG |
17 | #include "libradosstriper/RadosStriperImpl.h" |
18 | ||
19 | #include <errno.h> | |
20 | ||
21 | #include <sstream> | |
22 | #include <iomanip> | |
23 | #include <algorithm> | |
24 | ||
25 | #include "include/types.h" | |
26 | #include "include/uuid.h" | |
27 | #include "include/ceph_fs.h" | |
28 | #include "common/dout.h" | |
29 | #include "common/strtol.h" | |
9f95a23c | 30 | #include "common/RefCountedObj.h" |
7c673cae | 31 | #include "osdc/Striper.h" |
7c673cae FG |
32 | #include "librados/AioCompletionImpl.h" |
33 | #include <cls/lock/cls_lock_client.h> | |
34 | ||
35 | /* | |
36 | * This file contains the actual implementation of the rados striped objects interface. | |
37 | * | |
38 | * Striped objects are stored in rados in a set of regular rados objects, after their | |
39 | * content has been striped using the osdc/Striper interface. | |
40 | * | |
41 | * The external attributes of the striped object are mapped to the attributes of the | |
42 | * first underlying object. This first object has a set of extra external attributes | |
43 | * storing the layout of the striped object for future read back. These attributes are : | |
44 | * - striper.layout.object_size : the size of rados objects used. | |
45 | * Must be a multiple of striper.layout.stripe_unit | |
46 | * - striper.layout.stripe_unit : the size of a stripe unit | |
47 | * - striper.layout.stripe_count : the number of stripes used | |
48 | * - striper.size : total striped object size | |
49 | * | |
50 | * In general operations on striped objects are not atomic. | |
51 | * However, a certain number of safety guards have been put to make the interface closer | |
52 | * to atomicity : | |
53 | * - each data operation takes a shared lock on the first rados object for the | |
54 | * whole time of the operation | |
55 | * - the remove and trunc operations take an exclusive lock on the first rados object | |
56 | * for the whole time of the operation | |
57 | * This makes sure that no removal/truncation of a striped object occurs while | |
58 | * data operations are happening and vice versa. It thus makes sure that the layout | |
59 | * of a striped object does not change during data operation, which is essential for | |
60 | * data consistency. | |
61 | * | |
62 | * Still the writing to a striped object is not atomic. This means in particular that | |
63 | * the size of an object may not be in sync with its content at all times. | |
11fdf7f2 | 64 | * As the size is always guaranteed to be updated first and in an atomic way, and as |
7c673cae FG |
65 | * sparse striped objects are supported (see below), what will typically happen is |
66 | * that a reader that comes too soon after a write will read 0s instead of the actual | |
67 | * data. | |
68 | * | |
69 | * Note that remove handles the pieces of the striped object in reverse order, | |
70 | * so that the head object is removed last, making the completion of the deletion atomic. | |
71 | * | |
72 | * Striped objects can be sparse, typically in case data was written at the end of the | |
73 | * striped object only. In such a case, some rados objects constituting the striped object | |
74 | * may be missing. Others can be partial (only the beginning will have data). | |
75 | * When dealing with such sparse striped files, missing objects are detected and | |
76 | * considered as full of 0s. They are however not created until real data is written | |
77 | * to them. | |
78 | * | |
79 | * There are a number of missing features/improvements that could be implemented. | |
80 | * Here are some ideas : | |
81 | * - implementation of missing entry points (compared to rados) | |
82 | * In particular : clone_range, sparse_read, exec, aio_flush_async, tmaps, omaps, ... | |
83 | * | |
84 | */ | |
85 | ||
// Debug-output configuration for this translation unit (presumably consumed
// by the dout macros from common/dout.h - confirm): subsystem and line prefix.
86 | #define dout_subsys ceph_subsys_rados | |
87 | #undef dout_prefix | |
88 | #define dout_prefix *_dout << "libradosstriper: " | |
89 | ||
90 | /// size of xattr buffer | |
91 | #define XATTR_BUFFER_SIZE 32 | |
92 | ||
93 | /// names of the different xattr entries | |
// The first four are the layout/size attributes described in the file header;
// getxattrs() removes them from the attribute set returned to the user.
94 | #define XATTR_LAYOUT_STRIPE_UNIT "striper.layout.stripe_unit" | |
95 | #define XATTR_LAYOUT_STRIPE_COUNT "striper.layout.stripe_count" | |
96 | #define XATTR_LAYOUT_OBJECT_SIZE "striper.layout.object_size" | |
97 | #define XATTR_SIZE "striper.size" | |
// Prefix prepended to RADOS_LOCK_NAME to build the name of the lock attribute
// that getxattrs() also hides from the user.
98 | #define LOCK_PREFIX "lock." | |
99 | ||
100 | /// name of the lock used on objects to ensure layout stability during IO | |
101 | #define RADOS_LOCK_NAME "striper.lock" | |
102 | ||
103 | /// format of the extension of rados objects created for a given striped object | |
// (printf-style: a dot followed by a 16-digit, zero-padded hexadecimal number;
// aio_read appends it to the escaped striped object name)
104 | #define RADOS_OBJECT_EXTENSION_FORMAT ".%016llx" | |
105 | ||
106 | /// default object layout | |
20effc67 TL |
107 | static const struct ceph_file_layout default_file_layout = { |
108 | ceph_le32(1<<22), // fl_stripe_unit | |
109 | ceph_le32(1), // fl_stripe_count | |
110 | ceph_le32(1<<22), // fl_object_size | |
111 | ceph_le32(0), // fl_cas_hash | |
112 | ceph_le32(0), // fl_object_stripe_unit | |
113 | ceph_le32(-1), // fl_unused | |
114 | ceph_le32(-1), // fl_pg_pool | |
7c673cae FG |
115 | }; |
116 | ||
20effc67 TL |
117 | using std::map; |
118 | using std::pair; | |
119 | using std::string; | |
120 | using std::vector; | |
224ce89b WB |
121 | using libradosstriper::MultiAioCompletionImplPtr; |
122 | ||
123 | namespace { | |
7c673cae FG |
124 | |
125 | ///////////////////////// CompletionData ///////////////////////////// | |
126 | ||
224ce89b WB |
127 | /** |
128 | * struct handling the data needed to pass to the call back | |
129 | * function in asynchronous operations | |
130 | */ | |
131 | struct CompletionData : RefCountedObject { | |
224ce89b WB |
132 | /// complete method |
133 | void complete(int r); | |
134 | /// striper to be used to handle the write completion | |
135 | libradosstriper::RadosStriperImpl *m_striper; | |
136 | /// striped object concerned by the write operation | |
137 | std::string m_soid; | |
138 | /// shared lock to be released at completion | |
139 | std::string m_lockCookie; | |
140 | /// completion handler | |
141 | librados::IoCtxImpl::C_aio_Complete *m_ack; | |
9f95a23c TL |
142 | protected: |
143 | CompletionData(libradosstriper::RadosStriperImpl * striper, | |
144 | const std::string& soid, | |
145 | const std::string& lockCookie, | |
146 | librados::AioCompletionImpl *userCompletion = 0); | |
147 | ~CompletionData() override; | |
148 | ||
224ce89b WB |
149 | }; |
150 | ||
151 | CompletionData::CompletionData | |
7c673cae FG |
152 | (libradosstriper::RadosStriperImpl* striper, |
153 | const std::string& soid, | |
154 | const std::string& lockCookie, | |
9f95a23c TL |
155 | librados::AioCompletionImpl *userCompletion) : |
156 | RefCountedObject(striper->cct()), | |
7c673cae FG |
157 | m_striper(striper), m_soid(soid), m_lockCookie(lockCookie), m_ack(0) { |
158 | m_striper->get(); | |
159 | if (userCompletion) { | |
160 | m_ack = new librados::IoCtxImpl::C_aio_Complete(userCompletion); | |
161 | userCompletion->io = striper->m_ioCtxImpl; | |
162 | } | |
163 | } | |
164 | ||
224ce89b | 165 | CompletionData::~CompletionData() { |
7c673cae FG |
166 | if (m_ack) delete m_ack; |
167 | m_striper->put(); | |
168 | } | |
169 | ||
224ce89b | 170 | void CompletionData::complete(int r) { |
7c673cae FG |
171 | if (m_ack) m_ack->finish(r); |
172 | } | |
173 | ||
224ce89b WB |
174 | /** |
175 | * struct handling the data needed to pass to the call back | |
176 | * function in asynchronous read operations | |
177 | */ | |
178 | struct ReadCompletionData : CompletionData { | |
179 | /// bufferlist containing final result | |
180 | bufferlist* m_bl; | |
181 | /// extents that will be read | |
182 | std::vector<ObjectExtent>* m_extents; | |
183 | /// intermediate results | |
184 | std::vector<bufferlist>* m_resultbl; | |
185 | /// return code of read completion, to be remembered until unlocking happened | |
186 | int m_readRc; | |
187 | /// completion object for the unlocking of the striped object at the end of the read | |
188 | librados::AioCompletion *m_unlockCompletion; | |
9f95a23c TL |
189 | /// complete method for when reading is over |
190 | void complete_read(int r); | |
191 | /// complete method for when object is unlocked | |
192 | void complete_unlock(int r); | |
193 | ||
194 | private: | |
195 | FRIEND_MAKE_REF(ReadCompletionData); | |
224ce89b WB |
196 | ReadCompletionData(libradosstriper::RadosStriperImpl * striper, |
197 | const std::string& soid, | |
198 | const std::string& lockCookie, | |
199 | librados::AioCompletionImpl *userCompletion, | |
200 | bufferlist* bl, | |
201 | std::vector<ObjectExtent>* extents, | |
9f95a23c | 202 | std::vector<bufferlist>* resultbl); |
224ce89b | 203 | ~ReadCompletionData() override; |
224ce89b WB |
204 | }; |
205 | ||
206 | ReadCompletionData::ReadCompletionData | |
7c673cae FG |
207 | (libradosstriper::RadosStriperImpl* striper, |
208 | const std::string& soid, | |
209 | const std::string& lockCookie, | |
210 | librados::AioCompletionImpl *userCompletion, | |
211 | bufferlist* bl, | |
212 | std::vector<ObjectExtent>* extents, | |
9f95a23c TL |
213 | std::vector<bufferlist>* resultbl) : |
214 | CompletionData(striper, soid, lockCookie, userCompletion), | |
7c673cae FG |
215 | m_bl(bl), m_extents(extents), m_resultbl(resultbl), m_readRc(0), |
216 | m_unlockCompletion(0) {} | |
217 | ||
224ce89b | 218 | ReadCompletionData::~ReadCompletionData() { |
7c673cae FG |
219 | m_unlockCompletion->release(); |
220 | delete m_extents; | |
221 | delete m_resultbl; | |
222 | } | |
223 | ||
224ce89b | 224 | void ReadCompletionData::complete_read(int r) { |
7c673cae FG |
225 | // gather data into final buffer |
226 | Striper::StripedReadResult readResult; | |
227 | vector<bufferlist>::iterator bit = m_resultbl->begin(); | |
228 | for (vector<ObjectExtent>::iterator eit = m_extents->begin(); | |
229 | eit != m_extents->end(); | |
230 | ++eit, ++bit) { | |
231 | readResult.add_partial_result(m_striper->cct(), *bit, eit->buffer_extents); | |
232 | } | |
233 | m_bl->clear(); | |
234 | readResult.assemble_result(m_striper->cct(), *m_bl, true); | |
235 | // Remember return code | |
236 | m_readRc = r; | |
237 | } | |
238 | ||
224ce89b | 239 | void ReadCompletionData::complete_unlock(int r) { |
7c673cae FG |
240 | // call parent's completion method |
241 | // Note that we ignore the return code of the unlock as we cannot do much about it | |
242 | CompletionData::complete(m_readRc?m_readRc:m_bl->length()); | |
243 | } | |
244 | ||
224ce89b WB |
245 | /** |
246 | * struct handling the data needed to pass to the call back | |
247 | * function in asynchronous write operations | |
248 | */ | |
249 | struct WriteCompletionData : CompletionData { | |
250 | /// safe completion handler | |
251 | librados::IoCtxImpl::C_aio_Complete *m_safe; | |
224ce89b WB |
252 | /// completion object for the unlocking of the striped object at the end of the write |
253 | librados::AioCompletion *m_unlockCompletion; | |
11fdf7f2 TL |
254 | /// return code of write completion, to be remembered until unlocking happened |
255 | int m_writeRc; | |
224ce89b WB |
256 | /// complete method for when writing is over |
257 | void complete_write(int r); | |
258 | /// complete method for when object is unlocked | |
259 | void complete_unlock(int r); | |
260 | /// safe method | |
261 | void safe(int r); | |
9f95a23c TL |
262 | private: |
263 | FRIEND_MAKE_REF(WriteCompletionData); | |
264 | /// constructor | |
265 | WriteCompletionData(libradosstriper::RadosStriperImpl * striper, | |
266 | const std::string& soid, | |
267 | const std::string& lockCookie, | |
268 | librados::AioCompletionImpl *userCompletion); | |
269 | /// destructor | |
270 | ~WriteCompletionData() override; | |
224ce89b WB |
271 | }; |
272 | ||
273 | WriteCompletionData::WriteCompletionData | |
7c673cae FG |
274 | (libradosstriper::RadosStriperImpl* striper, |
275 | const std::string& soid, | |
276 | const std::string& lockCookie, | |
9f95a23c TL |
277 | librados::AioCompletionImpl *userCompletion) : |
278 | CompletionData(striper, soid, lockCookie, userCompletion), | |
279 | m_safe(0), m_unlockCompletion(0), m_writeRc(0) { | |
7c673cae FG |
280 | if (userCompletion) { |
281 | m_safe = new librados::IoCtxImpl::C_aio_Complete(userCompletion); | |
282 | } | |
283 | } | |
284 | ||
224ce89b | 285 | WriteCompletionData::~WriteCompletionData() { |
7c673cae FG |
286 | m_unlockCompletion->release(); |
287 | if (m_safe) delete m_safe; | |
288 | } | |
289 | ||
224ce89b | 290 | void WriteCompletionData::complete_unlock(int r) { |
7c673cae FG |
291 | // call parent's completion method |
292 | // Note that we ignore the return code of the unlock as we cannot do much about it | |
293 | CompletionData::complete(m_writeRc); | |
294 | } | |
295 | ||
224ce89b | 296 | void WriteCompletionData::complete_write(int r) { |
7c673cae FG |
297 | // Remember return code |
298 | m_writeRc = r; | |
299 | } | |
300 | ||
224ce89b | 301 | void WriteCompletionData::safe(int r) { |
7c673cae FG |
302 | if (m_safe) m_safe->finish(r); |
303 | } | |
304 | ||
224ce89b WB |
305 | struct RemoveCompletionData : CompletionData { |
306 | /// removal flags | |
307 | int flags; | |
9f95a23c TL |
308 | |
309 | private: | |
310 | FRIEND_MAKE_REF(RemoveCompletionData); | |
224ce89b WB |
311 | /** |
312 | * constructor | |
313 | * note that the constructed object will take ownership of the lock | |
314 | */ | |
315 | RemoveCompletionData(libradosstriper::RadosStriperImpl * striper, | |
316 | const std::string& soid, | |
317 | const std::string& lockCookie, | |
318 | librados::AioCompletionImpl *userCompletion, | |
319 | int flags = 0) : | |
7c673cae | 320 | CompletionData(striper, soid, lockCookie, userCompletion), flags(flags) {} |
224ce89b | 321 | }; |
7c673cae | 322 | |
224ce89b WB |
323 | /** |
324 | * struct handling the data needed to pass to the call back | |
325 | * function in asynchronous truncate operations | |
326 | */ | |
327 | struct TruncateCompletionData : RefCountedObject { | |
9f95a23c TL |
328 | /// striper to be used |
329 | libradosstriper::RadosStriperImpl *m_striper; | |
330 | /// striped object concerned by the truncate operation | |
331 | std::string m_soid; | |
332 | /// the final size of the truncated object | |
333 | uint64_t m_size; | |
334 | ||
335 | private: | |
336 | FRIEND_MAKE_REF(TruncateCompletionData); | |
224ce89b WB |
337 | /// constructor |
338 | TruncateCompletionData(libradosstriper::RadosStriperImpl* striper, | |
339 | const std::string& soid, | |
340 | uint64_t size) : | |
341 | RefCountedObject(striper->cct()), | |
342 | m_striper(striper), m_soid(soid), m_size(size) { | |
343 | m_striper->get(); | |
344 | } | |
345 | /// destructor | |
346 | ~TruncateCompletionData() override { | |
347 | m_striper->put(); | |
348 | } | |
224ce89b | 349 | }; |
7c673cae | 350 | |
224ce89b WB |
351 | /** |
352 | * struct handling the data needed to pass to the call back | |
353 | * function in asynchronous read operations of a Rados File | |
354 | */ | |
355 | struct RadosReadCompletionData : RefCountedObject { | |
224ce89b WB |
356 | /// the multi asynch io completion object to be used |
357 | MultiAioCompletionImplPtr m_multiAioCompl; | |
358 | /// the expected number of bytes | |
359 | uint64_t m_expectedBytes; | |
360 | /// the bufferlist object where data have been written | |
361 | bufferlist *m_bl; | |
9f95a23c TL |
362 | |
363 | private: | |
364 | FRIEND_MAKE_REF(RadosReadCompletionData); | |
365 | /// constructor | |
366 | RadosReadCompletionData(MultiAioCompletionImplPtr multiAioCompl, | |
367 | uint64_t expectedBytes, | |
368 | bufferlist *bl, | |
369 | CephContext *context) : | |
370 | RefCountedObject(context), | |
371 | m_multiAioCompl(multiAioCompl), m_expectedBytes(expectedBytes), m_bl(bl) {} | |
224ce89b WB |
372 | }; |
373 | ||
374 | /** | |
375 | * struct handling (most of) the data needed to pass to the call back | |
376 | * function in asynchronous stat operations. | |
377 | * Inherited by the actual type for adding time information in different | |
378 | * versions (time_t or struct timespec) | |
379 | */ | |
380 | struct BasicStatCompletionData : CompletionData { | |
224ce89b WB |
381 | // MultiAioCompletionImpl used to handle the double aysnc |
382 | // call in the back (stat + getxattr) | |
383 | libradosstriper::MultiAioCompletionImpl *m_multiCompletion; | |
384 | // where to store the size of first objct | |
385 | // this will be ignored but we need a place to store it when | |
386 | // async stat is called | |
387 | uint64_t m_objectSize; | |
388 | // where to store the file size | |
389 | uint64_t *m_psize; | |
390 | /// the bufferlist object used for the getxattr call | |
391 | bufferlist m_bl; | |
392 | /// return code of the stat | |
393 | int m_statRC; | |
394 | /// return code of the getxattr | |
395 | int m_getxattrRC; | |
9f95a23c TL |
396 | |
397 | protected: | |
398 | /// constructor | |
399 | BasicStatCompletionData(libradosstriper::RadosStriperImpl* striper, | |
400 | const std::string& soid, | |
401 | librados::AioCompletionImpl *userCompletion, | |
402 | libradosstriper::MultiAioCompletionImpl *multiCompletion, | |
403 | uint64_t *psize) : | |
404 | CompletionData(striper, soid, "", userCompletion), | |
405 | m_multiCompletion(multiCompletion), m_psize(psize), | |
406 | m_statRC(0), m_getxattrRC(0) {}; | |
407 | ||
224ce89b WB |
408 | }; |
409 | ||
410 | /** | |
411 | * struct handling the data needed to pass to the call back | |
412 | * function in asynchronous stat operations. | |
413 | * Simple templated extension of BasicStatCompletionData. | |
414 | * The template parameter is the type of the time information | |
415 | * (used with time_t for stat and struct timespec for stat2) | |
416 | */ | |
417 | template<class TimeType> | |
418 | struct StatCompletionData : BasicStatCompletionData { | |
9f95a23c TL |
419 | // where to store the file time |
420 | TimeType *m_pmtime; | |
421 | private: | |
422 | FRIEND_MAKE_REF(StatCompletionData); | |
224ce89b | 423 | /// constructor |
9f95a23c | 424 | StatCompletionData<TimeType>(libradosstriper::RadosStriperImpl* striper, |
224ce89b WB |
425 | const std::string& soid, |
426 | librados::AioCompletionImpl *userCompletion, | |
427 | libradosstriper::MultiAioCompletionImpl *multiCompletion, | |
428 | uint64_t *psize, | |
9f95a23c TL |
429 | TimeType *pmtime) : |
430 | BasicStatCompletionData(striper, soid, userCompletion, multiCompletion, psize), | |
224ce89b | 431 | m_pmtime(pmtime) {}; |
224ce89b WB |
432 | }; |
433 | ||
434 | /** | |
435 | * struct handling the data needed to pass to the call back | |
436 | * function in asynchronous remove operations of a Rados File | |
437 | */ | |
438 | struct RadosRemoveCompletionData : RefCountedObject { | |
9f95a23c TL |
439 | /// the multi asynch io completion object to be used |
440 | MultiAioCompletionImplPtr m_multiAioCompl; | |
441 | private: | |
442 | FRIEND_MAKE_REF(RadosRemoveCompletionData); | |
224ce89b WB |
443 | /// constructor |
444 | RadosRemoveCompletionData(MultiAioCompletionImplPtr multiAioCompl, | |
445 | CephContext *context) : | |
9f95a23c | 446 | RefCountedObject(context), |
224ce89b | 447 | m_multiAioCompl(multiAioCompl) {}; |
224ce89b WB |
448 | }; |
449 | ||
450 | ||
451 | } // namespace { | |
7c673cae FG |
452 | |
453 | ///////////////////////// constructor ///////////////////////////// | |
454 | ||
455 | libradosstriper::RadosStriperImpl::RadosStriperImpl(librados::IoCtx& ioctx, librados::IoCtxImpl *ioctx_impl) : | |
9f95a23c | 456 | m_refCnt(0), m_radosCluster(ioctx), m_ioCtx(ioctx), m_ioCtxImpl(ioctx_impl), |
7c673cae FG |
457 | m_layout(default_file_layout) {} |
458 | ||
459 | ///////////////////////// layout ///////////////////////////// | |
460 | ||
461 | int libradosstriper::RadosStriperImpl::setObjectLayoutStripeUnit | |
462 | (unsigned int stripe_unit) | |
463 | { | |
464 | /* stripe unit must be non-zero, 64k increment */ | |
465 | if (!stripe_unit || (stripe_unit & (CEPH_MIN_STRIPE_UNIT-1))) | |
466 | return -EINVAL; | |
467 | m_layout.fl_stripe_unit = stripe_unit; | |
468 | return 0; | |
469 | } | |
470 | ||
471 | int libradosstriper::RadosStriperImpl::setObjectLayoutStripeCount | |
472 | (unsigned int stripe_count) | |
473 | { | |
474 | /* stripe count must be non-zero */ | |
475 | if (!stripe_count) | |
476 | return -EINVAL; | |
477 | m_layout.fl_stripe_count = stripe_count; | |
478 | return 0; | |
479 | } | |
480 | ||
481 | int libradosstriper::RadosStriperImpl::setObjectLayoutObjectSize | |
482 | (unsigned int object_size) | |
483 | { | |
484 | /* object size must be non-zero, 64k increment */ | |
485 | if (!object_size || (object_size & (CEPH_MIN_STRIPE_UNIT-1))) | |
486 | return -EINVAL; | |
487 | /* object size must be a multiple of stripe unit */ | |
488 | if (object_size < m_layout.fl_stripe_unit || | |
489 | object_size % m_layout.fl_stripe_unit) | |
490 | return -EINVAL; | |
491 | m_layout.fl_object_size = object_size; | |
492 | return 0; | |
493 | } | |
494 | ||
495 | ///////////////////////// xattrs ///////////////////////////// | |
496 | ||
497 | int libradosstriper::RadosStriperImpl::getxattr(const object_t& soid, | |
498 | const char *name, | |
499 | bufferlist& bl) | |
500 | { | |
501 | std::string firstObjOid = getObjectId(soid, 0); | |
502 | return m_ioCtx.getxattr(firstObjOid, name, bl); | |
503 | } | |
504 | ||
505 | int libradosstriper::RadosStriperImpl::setxattr(const object_t& soid, | |
506 | const char *name, | |
507 | bufferlist& bl) | |
508 | { | |
509 | std::string firstObjOid = getObjectId(soid, 0); | |
510 | return m_ioCtx.setxattr(firstObjOid, name, bl); | |
511 | } | |
512 | ||
513 | int libradosstriper::RadosStriperImpl::getxattrs(const object_t& soid, | |
514 | map<string, bufferlist>& attrset) | |
515 | { | |
516 | std::string firstObjOid = getObjectId(soid, 0); | |
517 | int rc = m_ioCtx.getxattrs(firstObjOid, attrset); | |
518 | if (rc) return rc; | |
519 | // cleanup internal attributes dedicated to striping and locking | |
520 | attrset.erase(XATTR_LAYOUT_STRIPE_UNIT); | |
521 | attrset.erase(XATTR_LAYOUT_STRIPE_COUNT); | |
522 | attrset.erase(XATTR_LAYOUT_OBJECT_SIZE); | |
523 | attrset.erase(XATTR_SIZE); | |
524 | attrset.erase(std::string(LOCK_PREFIX) + RADOS_LOCK_NAME); | |
525 | return rc; | |
526 | } | |
527 | ||
528 | int libradosstriper::RadosStriperImpl::rmxattr(const object_t& soid, | |
529 | const char *name) | |
530 | { | |
531 | std::string firstObjOid = getObjectId(soid, 0); | |
532 | return m_ioCtx.rmxattr(firstObjOid, name); | |
533 | } | |
534 | ||
535 | ///////////////////////// io ///////////////////////////// | |
536 | ||
537 | int libradosstriper::RadosStriperImpl::write(const std::string& soid, | |
538 | const bufferlist& bl, | |
539 | size_t len, | |
540 | uint64_t off) | |
541 | { | |
542 | // open the object. This will create it if needed, retrieve its layout | |
543 | // and size and take a shared lock on it | |
544 | ceph_file_layout layout; | |
545 | std::string lockCookie; | |
546 | int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true); | |
547 | if (rc) return rc; | |
548 | return write_in_open_object(soid, layout, lockCookie, bl, len, off); | |
549 | } | |
550 | ||
551 | int libradosstriper::RadosStriperImpl::append(const std::string& soid, | |
552 | const bufferlist& bl, | |
553 | size_t len) | |
554 | { | |
555 | // open the object. This will create it if needed, retrieve its layout | |
556 | // and size and take a shared lock on it | |
557 | ceph_file_layout layout; | |
558 | uint64_t size = len; | |
559 | std::string lockCookie; | |
560 | int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false); | |
561 | if (rc) return rc; | |
562 | return write_in_open_object(soid, layout, lockCookie, bl, len, size); | |
563 | } | |
564 | ||
565 | int libradosstriper::RadosStriperImpl::write_full(const std::string& soid, | |
566 | const bufferlist& bl) | |
567 | { | |
568 | int rc = trunc(soid, 0); | |
569 | if (rc && rc != -ENOENT) return rc; // ENOENT is obviously ok | |
570 | return write(soid, bl, bl.length(), 0); | |
571 | } | |
572 | ||
573 | int libradosstriper::RadosStriperImpl::read(const std::string& soid, | |
574 | bufferlist* bl, | |
575 | size_t len, | |
576 | uint64_t off) | |
577 | { | |
578 | // create a completion object | |
579 | librados::AioCompletionImpl c; | |
580 | // call asynchronous method | |
581 | int rc = aio_read(soid, &c, bl, len, off); | |
582 | // and wait for completion | |
583 | if (!rc) { | |
584 | // wait for completion | |
585 | c.wait_for_complete_and_cb(); | |
586 | // return result | |
587 | rc = c.get_return_value(); | |
588 | } | |
589 | return rc; | |
590 | } | |
591 | ||
592 | ///////////////////////// asynchronous io ///////////////////////////// | |
593 | ||
594 | int libradosstriper::RadosStriperImpl::aio_write(const std::string& soid, | |
595 | librados::AioCompletionImpl *c, | |
596 | const bufferlist& bl, | |
597 | size_t len, | |
598 | uint64_t off) | |
599 | { | |
600 | ceph_file_layout layout; | |
601 | std::string lockCookie; | |
602 | int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true); | |
603 | if (rc) return rc; | |
604 | return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, off); | |
605 | } | |
606 | ||
607 | int libradosstriper::RadosStriperImpl::aio_append(const std::string& soid, | |
608 | librados::AioCompletionImpl *c, | |
609 | const bufferlist& bl, | |
610 | size_t len) | |
611 | { | |
612 | ceph_file_layout layout; | |
613 | uint64_t size = len; | |
614 | std::string lockCookie; | |
615 | int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false); | |
616 | if (rc) return rc; | |
617 | // create a completion object | |
618 | return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, size); | |
619 | } | |
620 | ||
621 | int libradosstriper::RadosStriperImpl::aio_write_full(const std::string& soid, | |
622 | librados::AioCompletionImpl *c, | |
623 | const bufferlist& bl) | |
624 | { | |
625 | int rc = trunc(soid, 0); | |
626 | if (rc) return rc; | |
627 | return aio_write(soid, c, bl, bl.length(), 0); | |
628 | } | |
629 | ||
630 | static void rados_read_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg) | |
631 | { | |
9f95a23c | 632 | auto cdata = ceph::ref_t<ReadCompletionData>(static_cast<ReadCompletionData*>(arg), false); |
7c673cae FG |
633 | libradosstriper::MultiAioCompletionImpl *comp = |
634 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
635 | cdata->complete_unlock(comp->rval); | |
7c673cae FG |
636 | } |
637 | ||
638 | static void striper_read_aio_req_complete(rados_striper_multi_completion_t c, void *arg) | |
639 | { | |
9f95a23c | 640 | auto cdata = static_cast<ReadCompletionData*>(arg); |
7c673cae FG |
641 | // launch the async unlocking of the object |
642 | cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion); | |
643 | // complete the read part in parallel | |
644 | libradosstriper::MultiAioCompletionImpl *comp = | |
645 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
646 | cdata->complete_read(comp->rval); | |
647 | } | |
648 | ||
7c673cae FG |
649 | static void rados_req_read_complete(rados_completion_t c, void *arg) |
650 | { | |
9f95a23c | 651 | auto data = static_cast<RadosReadCompletionData*>(arg); |
7c673cae FG |
652 | int rc = rados_aio_get_return_value(c); |
653 | // We need to handle the case of sparse files here | |
654 | if (rc == -ENOENT) { | |
655 | // the object did not exist at all. This can happen for sparse files. | |
656 | // we consider we've read 0 bytes and it will fall into next case | |
657 | rc = 0; | |
658 | } | |
9f95a23c | 659 | ssize_t nread = rc; |
7c673cae FG |
660 | if (rc >= 0 && (((uint64_t)rc) < data->m_expectedBytes)) { |
661 | // only partial data were present in the object (or the object did not | |
662 | // even exist if we've gone through previous case). | |
663 | // This is typical of sparse file and we need to complete with 0s. | |
664 | unsigned int lenOfZeros = data->m_expectedBytes-rc; | |
20effc67 | 665 | unsigned int existingDataToZero = std::min(data->m_bl->length()-rc, lenOfZeros); |
7c673cae FG |
666 | if (existingDataToZero > 0) { |
667 | data->m_bl->zero(rc, existingDataToZero); | |
668 | } | |
669 | if (lenOfZeros > existingDataToZero) { | |
670 | ceph::bufferptr zeros(ceph::buffer::create(lenOfZeros-existingDataToZero)); | |
671 | zeros.zero(); | |
672 | data->m_bl->push_back(zeros); | |
673 | } | |
9f95a23c | 674 | nread = data->m_expectedBytes; |
7c673cae | 675 | } |
9f95a23c TL |
676 | auto multi_aio_comp = data->m_multiAioCompl; |
677 | multi_aio_comp->complete_request(nread); | |
678 | multi_aio_comp->safe_request(rc); | |
7c673cae FG |
679 | } |
680 | ||
681 | int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid, | |
682 | librados::AioCompletionImpl *c, | |
683 | bufferlist* bl, | |
684 | size_t len, | |
685 | uint64_t off) | |
686 | { | |
687 | // open the object. This will retrieve its layout and size | |
688 | // and take a shared lock on it | |
689 | ceph_file_layout layout; | |
690 | uint64_t size; | |
691 | std::string lockCookie; | |
692 | int rc = openStripedObjectForRead(soid, &layout, &size, &lockCookie); | |
693 | if (rc) return rc; | |
694 | // find out the actual number of bytes we can read | |
695 | uint64_t read_len; | |
696 | if (off >= size) { | |
697 | // nothing to read ! We are done. | |
698 | read_len = 0; | |
699 | } else { | |
20effc67 | 700 | read_len = std::min(len, (size_t)(size-off)); |
7c673cae FG |
701 | } |
702 | // get list of extents to be read from | |
703 | vector<ObjectExtent> *extents = new vector<ObjectExtent>(); | |
704 | if (read_len > 0) { | |
c07f9fc5 FG |
705 | std::string format = soid; |
706 | boost::replace_all(format, "%", "%%"); | |
707 | format += RADOS_OBJECT_EXTENSION_FORMAT; | |
7c673cae FG |
708 | file_layout_t l; |
709 | l.from_legacy(layout); | |
710 | Striper::file_to_extents(cct(), format.c_str(), &l, off, read_len, | |
711 | 0, *extents); | |
712 | } | |
713 | ||
714 | // create a completion object and transfer ownership of extents and resultbl | |
715 | vector<bufferlist> *resultbl = new vector<bufferlist>(extents->size()); | |
9f95a23c | 716 | auto cdata = ceph::make_ref<ReadCompletionData>(this, soid, lockCookie, c, bl, extents, resultbl); |
7c673cae FG |
717 | c->is_read = true; |
718 | c->io = m_ioCtxImpl; | |
719 | // create a completion for the unlocking of the striped object at the end of the read | |
720 | librados::AioCompletion *unlock_completion = | |
9f95a23c | 721 | librados::Rados::aio_create_completion(cdata->get() /* create ref! */, rados_read_aio_unlock_complete); |
7c673cae FG |
722 | cdata->m_unlockCompletion = unlock_completion; |
723 | // create the multiCompletion object handling the reads | |
224ce89b WB |
724 | MultiAioCompletionImplPtr nc{new libradosstriper::MultiAioCompletionImpl, |
725 | false}; | |
9f95a23c | 726 | nc->set_complete_callback(cdata.get(), striper_read_aio_req_complete); |
7c673cae FG |
727 | // go through the extents |
728 | int r = 0, i = 0; | |
729 | for (vector<ObjectExtent>::iterator p = extents->begin(); p != extents->end(); ++p) { | |
730 | // create a buffer list describing where to place data read from current extend | |
731 | bufferlist *oid_bl = &((*resultbl)[i++]); | |
732 | for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin(); | |
733 | q != p->buffer_extents.end(); | |
734 | ++q) { | |
735 | bufferlist buffer_bl; | |
736 | buffer_bl.substr_of(*bl, q->first, q->second); | |
737 | oid_bl->append(buffer_bl); | |
738 | } | |
739 | // read all extends of a given object in one go | |
740 | nc->add_request(); | |
741 | // we need 2 references on data as both rados_req_read_safe and rados_req_read_complete | |
742 | // will release one | |
9f95a23c | 743 | auto data = ceph::make_ref<RadosReadCompletionData>(nc, p->length, oid_bl, cct()); |
7c673cae | 744 | librados::AioCompletion *rados_completion = |
9f95a23c | 745 | librados::Rados::aio_create_completion(data.detach(), rados_req_read_complete); |
7c673cae FG |
746 | r = m_ioCtx.aio_read(p->oid.name, rados_completion, oid_bl, p->length, p->offset); |
747 | rados_completion->release(); | |
748 | if (r < 0) | |
749 | break; | |
750 | } | |
751 | nc->finish_adding_requests(); | |
7c673cae FG |
752 | return r; |
753 | } | |
754 | ||
755 | int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid, | |
756 | librados::AioCompletionImpl *c, | |
757 | char* buf, | |
758 | size_t len, | |
759 | uint64_t off) | |
760 | { | |
761 | // create a buffer list and store it inside the completion object | |
762 | c->bl.clear(); | |
763 | c->bl.push_back(buffer::create_static(len, buf)); | |
764 | // call the bufferlist version of this method | |
765 | return aio_read(soid, c, &c->bl, len, off); | |
766 | } | |
767 | ||
768 | int libradosstriper::RadosStriperImpl::aio_flush() | |
769 | { | |
770 | int ret; | |
771 | // pass to the rados level | |
772 | ret = m_ioCtx.aio_flush(); | |
773 | if (ret < 0) | |
774 | return ret; | |
775 | //wait all CompletionData are released | |
9f95a23c TL |
776 | std::unique_lock l{lock}; |
777 | cond.wait(l, [this] {return m_refCnt <= 1;}); | |
7c673cae FG |
778 | return ret; |
779 | } | |
780 | ||
781 | ///////////////////////// stat and deletion ///////////////////////////// | |
782 | ||
783 | int libradosstriper::RadosStriperImpl::stat(const std::string& soid, uint64_t *psize, time_t *pmtime) | |
784 | { | |
785 | // create a completion object | |
786 | librados::AioCompletionImpl c; | |
787 | // call asynchronous version of stat | |
788 | int rc = aio_stat(soid, &c, psize, pmtime); | |
789 | if (rc == 0) { | |
790 | // wait for completion of the remove | |
791 | c.wait_for_complete(); | |
792 | // get result | |
793 | rc = c.get_return_value(); | |
794 | } | |
795 | return rc; | |
796 | } | |
797 | ||
798 | static void striper_stat_aio_stat_complete(rados_completion_t c, void *arg) { | |
9f95a23c | 799 | auto data = ceph::ref_t<BasicStatCompletionData>(static_cast<BasicStatCompletionData*>(arg), false); |
7c673cae FG |
800 | int rc = rados_aio_get_return_value(c); |
801 | if (rc == -ENOENT) { | |
802 | // remember this has failed | |
803 | data->m_statRC = rc; | |
804 | } | |
805 | data->m_multiCompletion->complete_request(rc); | |
7c673cae FG |
806 | } |
807 | ||
808 | static void striper_stat_aio_getxattr_complete(rados_completion_t c, void *arg) { | |
9f95a23c | 809 | auto data = ceph::ref_t<BasicStatCompletionData>(static_cast<BasicStatCompletionData*>(arg), false); |
7c673cae FG |
810 | int rc = rados_aio_get_return_value(c); |
811 | // We need to handle the case of sparse files here | |
812 | if (rc < 0) { | |
813 | // remember this has failed | |
814 | data->m_getxattrRC = rc; | |
815 | } else { | |
816 | // this intermediate string allows to add a null terminator before calling strtol | |
817 | std::string err; | |
818 | std::string strsize(data->m_bl.c_str(), data->m_bl.length()); | |
819 | *data->m_psize = strict_strtoll(strsize.c_str(), 10, &err); | |
820 | if (!err.empty()) { | |
821 | lderr(data->m_striper->cct()) << XATTR_SIZE << " : " << err << dendl; | |
822 | data->m_getxattrRC = -EINVAL; | |
823 | } | |
824 | rc = 0; | |
825 | } | |
826 | data->m_multiCompletion->complete_request(rc); | |
7c673cae FG |
827 | } |
828 | ||
829 | static void striper_stat_aio_req_complete(rados_striper_multi_completion_t c, | |
830 | void *arg) { | |
9f95a23c | 831 | auto data = ceph::ref_t<BasicStatCompletionData>(static_cast<BasicStatCompletionData*>(arg), false); |
7c673cae FG |
832 | if (data->m_statRC) { |
833 | data->complete(data->m_statRC); | |
834 | } else { | |
835 | if (data->m_getxattrRC < 0) { | |
836 | data->complete(data->m_getxattrRC); | |
837 | } else { | |
838 | data->complete(0); | |
839 | } | |
840 | } | |
7c673cae FG |
841 | } |
842 | ||
843 | template<class TimeType> | |
844 | int libradosstriper::RadosStriperImpl::aio_generic_stat | |
845 | (const std::string& soid, | |
846 | librados::AioCompletionImpl *c, | |
847 | uint64_t *psize, | |
848 | TimeType *pmtime, | |
849 | typename libradosstriper::RadosStriperImpl::StatFunction<TimeType>::Type statFunction) | |
850 | { | |
851 | // use a MultiAioCompletion object for dealing with the fact | |
852 | // that we'll do 2 asynchronous calls in parallel | |
224ce89b WB |
853 | MultiAioCompletionImplPtr multi_completion{ |
854 | new libradosstriper::MultiAioCompletionImpl, false}; | |
7c673cae FG |
855 | // Data object used for passing context to asynchronous calls |
856 | std::string firstObjOid = getObjectId(soid, 0); | |
9f95a23c TL |
857 | auto cdata = ceph::make_ref<StatCompletionData<TimeType>>(this, firstObjOid, c, multi_completion.get(), psize, pmtime); |
858 | multi_completion->set_complete_callback(cdata->get() /* create ref! */, striper_stat_aio_req_complete); | |
7c673cae FG |
859 | // use a regular AioCompletion for the stat async call |
860 | librados::AioCompletion *stat_completion = | |
9f95a23c | 861 | librados::Rados::aio_create_completion(cdata->get() /* create ref! */, striper_stat_aio_stat_complete); |
7c673cae FG |
862 | multi_completion->add_safe_request(); |
863 | object_t obj(firstObjOid); | |
864 | int rc = (m_ioCtxImpl->*statFunction)(obj, stat_completion->pc, | |
865 | &cdata->m_objectSize, cdata->m_pmtime); | |
866 | stat_completion->release(); | |
867 | if (rc < 0) { | |
868 | // nothing is really started so cancel everything | |
9f95a23c | 869 | delete cdata.detach(); |
7c673cae FG |
870 | return rc; |
871 | } | |
872 | // use a regular AioCompletion for the getxattr async call | |
873 | librados::AioCompletion *getxattr_completion = | |
9f95a23c | 874 | librados::Rados::aio_create_completion(cdata->get() /* create ref! */, striper_stat_aio_getxattr_complete); |
7c673cae FG |
875 | multi_completion->add_safe_request(); |
876 | // in parallel, get the pmsize from the first object asynchronously | |
877 | rc = m_ioCtxImpl->aio_getxattr(obj, getxattr_completion->pc, | |
878 | XATTR_SIZE, cdata->m_bl); | |
879 | getxattr_completion->release(); | |
880 | multi_completion->finish_adding_requests(); | |
881 | if (rc < 0) { | |
882 | // the async stat is ongoing, so we need to go on | |
883 | // we mark the getxattr as failed in the data object | |
884 | cdata->m_getxattrRC = rc; | |
885 | multi_completion->complete_request(rc); | |
7c673cae FG |
886 | return rc; |
887 | } | |
7c673cae FG |
888 | return 0; |
889 | } | |
890 | ||
891 | int libradosstriper::RadosStriperImpl::aio_stat(const std::string& soid, | |
892 | librados::AioCompletionImpl *c, | |
893 | uint64_t *psize, | |
894 | time_t *pmtime) | |
895 | { | |
896 | return aio_generic_stat<time_t>(soid, c, psize, pmtime, &librados::IoCtxImpl::aio_stat); | |
897 | } | |
898 | ||
899 | int libradosstriper::RadosStriperImpl::stat2(const std::string& soid, uint64_t *psize, struct timespec *pts) | |
900 | { | |
901 | // create a completion object | |
902 | librados::AioCompletionImpl c; | |
903 | // call asynchronous version of stat | |
904 | int rc = aio_stat2(soid, &c, psize, pts); | |
905 | if (rc == 0) { | |
906 | // wait for completion of the remove | |
907 | c.wait_for_complete_and_cb(); | |
908 | // get result | |
909 | rc = c.get_return_value(); | |
910 | } | |
911 | return rc; | |
912 | } | |
913 | ||
914 | int libradosstriper::RadosStriperImpl::aio_stat2(const std::string& soid, | |
915 | librados::AioCompletionImpl *c, | |
916 | uint64_t *psize, | |
917 | struct timespec *pts) | |
918 | { | |
919 | return aio_generic_stat<struct timespec>(soid, c, psize, pts, &librados::IoCtxImpl::aio_stat2); | |
920 | } | |
921 | ||
922 | static void rados_req_remove_complete(rados_completion_t c, void *arg) | |
923 | { | |
9f95a23c | 924 | auto cdata = static_cast<RadosRemoveCompletionData*>(arg); |
7c673cae FG |
925 | int rc = rados_aio_get_return_value(c); |
926 | // in case the object did not exist, it means we had a sparse file, all is fine | |
927 | if (rc == -ENOENT) { | |
928 | rc = 0; | |
929 | } | |
930 | cdata->m_multiAioCompl->complete_request(rc); | |
7c673cae | 931 | cdata->m_multiAioCompl->safe_request(rc); |
7c673cae FG |
932 | } |
933 | ||
934 | static void striper_remove_aio_req_complete(rados_striper_multi_completion_t c, void *arg) | |
935 | { | |
9f95a23c | 936 | auto cdata = ceph::ref_t<RemoveCompletionData>(static_cast<RemoveCompletionData*>(arg), false); |
7c673cae FG |
937 | libradosstriper::MultiAioCompletionImpl *comp = |
938 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
939 | ldout(cdata->m_striper->cct(), 10) | |
940 | << "RadosStriperImpl : striper_remove_aio_req_complete called for " | |
941 | << cdata->m_soid << dendl; | |
942 | int rc = comp->rval; | |
943 | if (rc == 0) { | |
944 | // All went fine, synchronously remove first object | |
945 | rc = cdata->m_striper->m_ioCtx.remove(cdata->m_striper->getObjectId(cdata->m_soid, 0), | |
946 | cdata->flags); | |
947 | } else { | |
948 | lderr(cdata->m_striper->cct()) | |
949 | << "RadosStriperImpl : deletion/truncation incomplete for " << cdata->m_soid | |
950 | << ", as errors were encountered. The file is left present but it's content " | |
951 | << " has been partially removed" | |
952 | << dendl; | |
953 | } | |
954 | cdata->complete(rc); | |
7c673cae FG |
955 | } |
956 | ||
957 | int libradosstriper::RadosStriperImpl::remove(const std::string& soid, int flags) | |
958 | { | |
959 | // create a completion object | |
960 | librados::AioCompletionImpl c; | |
961 | // call asynchronous version of remove | |
962 | int rc = aio_remove(soid, &c, flags); | |
963 | if (rc == 0) { | |
964 | // wait for completion of the remove | |
965 | c.wait_for_complete_and_cb(); | |
966 | // get result | |
967 | rc = c.get_return_value(); | |
968 | } | |
969 | return rc; | |
970 | } | |
971 | ||
972 | int libradosstriper::RadosStriperImpl::aio_remove(const std::string& soid, | |
973 | librados::AioCompletionImpl *c, | |
974 | int flags) | |
975 | { | |
976 | // the RemoveCompletionData object will lock the given soid for the duration | |
977 | // of the removal | |
978 | std::string lockCookie = getUUID(); | |
979 | int rc = m_ioCtx.lock_exclusive(getObjectId(soid, 0), RADOS_LOCK_NAME, lockCookie, "", 0, 0); | |
980 | if (rc) return rc; | |
981 | // create CompletionData for the async remove call | |
9f95a23c | 982 | auto cdata = ceph::make_ref<RemoveCompletionData>(this, soid, lockCookie, c, flags); |
224ce89b WB |
983 | MultiAioCompletionImplPtr multi_completion{ |
984 | new libradosstriper::MultiAioCompletionImpl, false}; | |
9f95a23c | 985 | multi_completion->set_complete_callback(cdata->get() /* create ref! */, striper_remove_aio_req_complete); |
7c673cae FG |
986 | // call asynchronous internal version of remove |
987 | ldout(cct(), 10) | |
988 | << "RadosStriperImpl : Aio_remove starting for " | |
989 | << soid << dendl; | |
990 | rc = internal_aio_remove(soid, multi_completion); | |
7c673cae FG |
991 | return rc; |
992 | } | |
993 | ||
224ce89b WB |
994 | int libradosstriper::RadosStriperImpl::internal_aio_remove( |
995 | const std::string& soid, | |
996 | MultiAioCompletionImplPtr multi_completion, | |
7c673cae FG |
997 | int flags) |
998 | { | |
999 | std::string firstObjOid = getObjectId(soid, 0); | |
1000 | try { | |
1001 | // check size and get number of rados objects to delete | |
1002 | uint64_t nb_objects = 0; | |
1003 | bufferlist bl2; | |
1004 | int rc = getxattr(soid, XATTR_SIZE, bl2); | |
1005 | if (rc < 0) { | |
1006 | // no object size (or not able to get it) | |
1007 | // try to find the number of object "by hand" | |
1008 | uint64_t psize; | |
1009 | time_t pmtime; | |
1010 | while (!m_ioCtx.stat(getObjectId(soid, nb_objects), &psize, &pmtime)) { | |
1011 | nb_objects++; | |
1012 | } | |
1013 | } else { | |
1014 | // count total number of rados objects in the striped object | |
1015 | std::string err; | |
1016 | // this intermediate string allows to add a null terminator before calling strtol | |
1017 | std::string strsize(bl2.c_str(), bl2.length()); | |
1018 | uint64_t size = strict_strtoll(strsize.c_str(), 10, &err); | |
1019 | if (!err.empty()) { | |
1020 | lderr(cct()) << XATTR_SIZE << " : " << err << dendl; | |
1021 | ||
1022 | return -EINVAL; | |
1023 | } | |
1024 | uint64_t object_size = m_layout.fl_object_size; | |
1025 | uint64_t su = m_layout.fl_stripe_unit; | |
1026 | uint64_t stripe_count = m_layout.fl_stripe_count; | |
1027 | uint64_t nb_complete_sets = size / (object_size*stripe_count); | |
1028 | uint64_t remaining_data = size % (object_size*stripe_count); | |
1029 | uint64_t remaining_stripe_units = (remaining_data + su -1) / su; | |
1030 | uint64_t remaining_objects = std::min(remaining_stripe_units, stripe_count); | |
1031 | nb_objects = nb_complete_sets * stripe_count + remaining_objects; | |
1032 | } | |
1033 | // delete rados objects in reverse order | |
1034 | // Note that we do not drop the first object. This one will only be dropped | |
1035 | // if all other removals have been successful, and this is done in the | |
1036 | // callback of the multi_completion object | |
1037 | int rcr = 0; | |
1038 | for (int i = nb_objects-1; i >= 1; i--) { | |
1039 | multi_completion->add_request(); | |
9f95a23c | 1040 | auto data = ceph::make_ref<RadosRemoveCompletionData>(multi_completion, cct()); |
7c673cae | 1041 | librados::AioCompletion *rados_completion = |
9f95a23c TL |
1042 | librados::Rados::aio_create_completion(data->get() /* create ref! */, |
1043 | rados_req_remove_complete); | |
7c673cae FG |
1044 | if (flags == 0) { |
1045 | rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion); | |
1046 | } else { | |
1047 | rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion, flags); | |
1048 | } | |
1049 | rados_completion->release(); | |
1050 | if (rcr < 0 and -ENOENT != rcr) { | |
1051 | lderr(cct()) << "RadosStriperImpl::remove : deletion incomplete for " << soid | |
1052 | << ", as " << getObjectId(soid, i) << " could not be deleted (rc=" << rc << ")" | |
1053 | << dendl; | |
1054 | break; | |
1055 | } | |
1056 | } | |
1057 | // we are over adding requests to the multi_completion object | |
1058 | multi_completion->finish_adding_requests(); | |
1059 | // return | |
1060 | return rcr; | |
1061 | } catch (ErrorCode &e) { | |
11fdf7f2 | 1062 | // error caught when trying to take the exclusive lock |
7c673cae FG |
1063 | return e.m_code; |
1064 | } | |
1065 | ||
1066 | } | |
1067 | ||
1068 | int libradosstriper::RadosStriperImpl::trunc(const std::string& soid, uint64_t size) | |
1069 | { | |
1070 | // lock the object in exclusive mode | |
1071 | std::string firstObjOid = getObjectId(soid, 0); | |
1072 | librados::ObjectWriteOperation op; | |
1073 | op.assert_exists(); | |
1074 | std::string lockCookie = RadosStriperImpl::getUUID(); | |
1075 | utime_t dur = utime_t(); | |
f67539c2 | 1076 | rados::cls::lock::lock(&op, RADOS_LOCK_NAME, ClsLockType::EXCLUSIVE, lockCookie, "", "", dur, 0); |
7c673cae FG |
1077 | int rc = m_ioCtx.operate(firstObjOid, &op); |
1078 | if (rc) return rc; | |
1079 | // load layout and size | |
1080 | ceph_file_layout layout; | |
1081 | uint64_t original_size; | |
1082 | rc = internal_get_layout_and_size(firstObjOid, &layout, &original_size); | |
1083 | if (!rc) { | |
1084 | if (size < original_size) { | |
1085 | rc = truncate(soid, original_size, size, layout); | |
1086 | } else if (size > original_size) { | |
1087 | rc = grow(soid, original_size, size, layout); | |
1088 | } | |
1089 | } | |
1090 | // unlock object, ignore return code as we cannot do much | |
1091 | m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie); | |
1092 | // final return | |
1093 | return rc; | |
1094 | } | |
1095 | ||
1096 | ||
1097 | ///////////////////////// private helpers ///////////////////////////// | |
1098 | ||
1099 | std::string libradosstriper::RadosStriperImpl::getObjectId(const object_t& soid, | |
1100 | long long unsigned objectno) | |
1101 | { | |
1102 | std::ostringstream s; | |
1103 | s << soid << '.' << std::setfill ('0') << std::setw(16) << std::hex << objectno; | |
1104 | return s.str(); | |
1105 | } | |
1106 | ||
1107 | void libradosstriper::RadosStriperImpl::unlockObject(const std::string& soid, | |
1108 | const std::string& lockCookie) | |
1109 | { | |
1110 | // unlock the shared lock on the first rados object | |
1111 | std::string firstObjOid = getObjectId(soid, 0); | |
1112 | m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie); | |
1113 | } | |
1114 | ||
1115 | void libradosstriper::RadosStriperImpl::aio_unlockObject(const std::string& soid, | |
1116 | const std::string& lockCookie, | |
1117 | librados::AioCompletion *c) | |
1118 | { | |
1119 | // unlock the shared lock on the first rados object | |
1120 | std::string firstObjOid = getObjectId(soid, 0); | |
1121 | m_ioCtx.aio_unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie, c); | |
1122 | } | |
1123 | ||
1124 | static void rados_write_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg) | |
1125 | { | |
9f95a23c | 1126 | auto cdata = ceph::ref_t<WriteCompletionData>(static_cast<WriteCompletionData*>(arg), false); |
7c673cae FG |
1127 | libradosstriper::MultiAioCompletionImpl *comp = |
1128 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
1129 | cdata->complete_unlock(comp->rval); | |
7c673cae FG |
1130 | } |
1131 | ||
1132 | static void striper_write_aio_req_complete(rados_striper_multi_completion_t c, void *arg) | |
1133 | { | |
9f95a23c | 1134 | auto cdata = ceph::ref_t<WriteCompletionData>(static_cast<WriteCompletionData*>(arg), false); |
7c673cae FG |
1135 | // launch the async unlocking of the object |
1136 | cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion); | |
1137 | // complete the write part in parallel | |
1138 | libradosstriper::MultiAioCompletionImpl *comp = | |
1139 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
1140 | cdata->complete_write(comp->rval); | |
7c673cae FG |
1141 | } |
1142 | ||
1143 | static void striper_write_aio_req_safe(rados_striper_multi_completion_t c, void *arg) | |
1144 | { | |
9f95a23c | 1145 | auto cdata = ceph::ref_t<WriteCompletionData>(static_cast<WriteCompletionData*>(arg), false); |
7c673cae FG |
1146 | libradosstriper::MultiAioCompletionImpl *comp = |
1147 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
1148 | cdata->safe(comp->rval); | |
7c673cae FG |
1149 | } |
1150 | ||
1151 | int libradosstriper::RadosStriperImpl::write_in_open_object(const std::string& soid, | |
1152 | const ceph_file_layout& layout, | |
1153 | const std::string& lockCookie, | |
1154 | const bufferlist& bl, | |
1155 | size_t len, | |
1156 | uint64_t off) { | |
1157 | // create a completion object to be passed to the callbacks of the multicompletion | |
1158 | // we need 3 references as striper_write_aio_req_complete will release two and | |
1159 | // striper_write_aio_req_safe will release one | |
9f95a23c | 1160 | auto cdata = ceph::make_ref<WriteCompletionData>(this, soid, lockCookie, nullptr); |
7c673cae FG |
1161 | // create a completion object for the unlocking of the striped object at the end of the write |
1162 | librados::AioCompletion *unlock_completion = | |
9f95a23c | 1163 | librados::Rados::aio_create_completion(cdata->get() /* create ref! */, rados_write_aio_unlock_complete); |
7c673cae FG |
1164 | cdata->m_unlockCompletion = unlock_completion; |
1165 | // create the multicompletion that will handle the write completion | |
224ce89b WB |
1166 | MultiAioCompletionImplPtr c{new libradosstriper::MultiAioCompletionImpl, |
1167 | false}; | |
9f95a23c TL |
1168 | c->set_complete_callback(cdata->get() /* create ref! */, striper_write_aio_req_complete); |
1169 | c->set_safe_callback(cdata->get() /* create ref! */, striper_write_aio_req_safe); | |
7c673cae FG |
1170 | // call the asynchronous API |
1171 | int rc = internal_aio_write(soid, c, bl, len, off, layout); | |
1172 | if (!rc) { | |
1173 | // wait for completion and safety of data | |
1174 | c->wait_for_complete_and_cb(); | |
1175 | c->wait_for_safe_and_cb(); | |
1176 | // wait for the unlocking | |
1177 | unlock_completion->wait_for_complete(); | |
1178 | // return result | |
1179 | rc = c->get_return_value(); | |
1180 | } | |
7c673cae FG |
1181 | return rc; |
1182 | } | |
1183 | ||
1184 | int libradosstriper::RadosStriperImpl::aio_write_in_open_object(const std::string& soid, | |
1185 | librados::AioCompletionImpl *c, | |
1186 | const ceph_file_layout& layout, | |
1187 | const std::string& lockCookie, | |
1188 | const bufferlist& bl, | |
1189 | size_t len, | |
1190 | uint64_t off) { | |
1191 | // create a completion object to be passed to the callbacks of the multicompletion | |
1192 | // we need 3 references as striper_write_aio_req_complete will release two and | |
1193 | // striper_write_aio_req_safe will release one | |
9f95a23c | 1194 | auto cdata = ceph::make_ref<WriteCompletionData>(this, soid, lockCookie, c); |
7c673cae FG |
1195 | m_ioCtxImpl->get(); |
1196 | c->io = m_ioCtxImpl; | |
1197 | // create a completion object for the unlocking of the striped object at the end of the write | |
1198 | librados::AioCompletion *unlock_completion = | |
9f95a23c | 1199 | librados::Rados::aio_create_completion(cdata->get() /* create ref! */, rados_write_aio_unlock_complete); |
7c673cae FG |
1200 | cdata->m_unlockCompletion = unlock_completion; |
1201 | // create the multicompletion that will handle the write completion | |
224ce89b WB |
1202 | libradosstriper::MultiAioCompletionImplPtr nc{ |
1203 | new libradosstriper::MultiAioCompletionImpl, false}; | |
9f95a23c TL |
1204 | nc->set_complete_callback(cdata->get() /* create ref! */, striper_write_aio_req_complete); |
1205 | nc->set_safe_callback(cdata->get() /* create ref! */, striper_write_aio_req_safe); | |
7c673cae FG |
1206 | // internal asynchronous API |
1207 | int rc = internal_aio_write(soid, nc, bl, len, off, layout); | |
7c673cae FG |
1208 | return rc; |
1209 | } | |
1210 | ||
7c673cae FG |
1211 | static void rados_req_write_complete(rados_completion_t c, void *arg) |
1212 | { | |
9f95a23c | 1213 | auto comp = reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(arg); |
7c673cae | 1214 | comp->complete_request(rados_aio_get_return_value(c)); |
9f95a23c | 1215 | comp->safe_request(rados_aio_get_return_value(c)); |
7c673cae FG |
1216 | } |
1217 | ||
1218 | int | |
1219 | libradosstriper::RadosStriperImpl::internal_aio_write(const std::string& soid, | |
224ce89b | 1220 | libradosstriper::MultiAioCompletionImplPtr c, |
7c673cae FG |
1221 | const bufferlist& bl, |
1222 | size_t len, | |
1223 | uint64_t off, | |
1224 | const ceph_file_layout& layout) | |
1225 | { | |
1226 | int r = 0; | |
1227 | // Do not try anything if we are called with empty buffer, | |
1228 | // file_to_extents would raise an exception | |
1229 | if (len > 0) { | |
1230 | // get list of extents to be written to | |
1231 | vector<ObjectExtent> extents; | |
c07f9fc5 FG |
1232 | std::string format = soid; |
1233 | boost::replace_all(format, "%", "%%"); | |
1234 | format += RADOS_OBJECT_EXTENSION_FORMAT; | |
7c673cae FG |
1235 | file_layout_t l; |
1236 | l.from_legacy(layout); | |
1237 | Striper::file_to_extents(cct(), format.c_str(), &l, off, len, 0, extents); | |
1238 | // go through the extents | |
1239 | for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) { | |
1240 | // assemble pieces of a given object into a single buffer list | |
1241 | bufferlist oid_bl; | |
1242 | for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin(); | |
1243 | q != p->buffer_extents.end(); | |
1244 | ++q) { | |
1245 | bufferlist buffer_bl; | |
1246 | buffer_bl.substr_of(bl, q->first, q->second); | |
1247 | oid_bl.append(buffer_bl); | |
1248 | } | |
1249 | // and write the object | |
1250 | c->add_request(); | |
1251 | librados::AioCompletion *rados_completion = | |
224ce89b | 1252 | librados::Rados::aio_create_completion(c.get(), |
9f95a23c | 1253 | rados_req_write_complete); |
224ce89b WB |
1254 | r = m_ioCtx.aio_write(p->oid.name, rados_completion, oid_bl, |
1255 | p->length, p->offset); | |
7c673cae FG |
1256 | rados_completion->release(); |
1257 | if (r < 0) | |
1258 | break; | |
1259 | } | |
1260 | } | |
1261 | c->finish_adding_requests(); | |
1262 | return r; | |
1263 | } | |
1264 | ||
1265 | int libradosstriper::RadosStriperImpl::extract_uint32_attr | |
1266 | (std::map<std::string, bufferlist> &attrs, | |
1267 | const std::string& key, | |
1268 | ceph_le32 *value) | |
1269 | { | |
1270 | std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key); | |
1271 | if (attrsIt != attrs.end()) { | |
1272 | // this intermediate string allows to add a null terminator before calling strtol | |
1273 | std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length()); | |
1274 | std::string err; | |
1275 | *value = strict_strtol(strvalue.c_str(), 10, &err); | |
1276 | if (!err.empty()) { | |
1277 | lderr(cct()) << key << " : " << err << dendl; | |
1278 | return -EINVAL; | |
1279 | } | |
1280 | } else { | |
1281 | return -ENOENT; | |
1282 | } | |
1283 | return 0; | |
1284 | } | |
1285 | ||
1286 | int libradosstriper::RadosStriperImpl::extract_sizet_attr | |
1287 | (std::map<std::string, bufferlist> &attrs, | |
1288 | const std::string& key, | |
1289 | size_t *value) | |
1290 | { | |
1291 | std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key); | |
1292 | if (attrsIt != attrs.end()) { | |
1293 | // this intermediate string allows to add a null terminator before calling strtol | |
1294 | std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length()); | |
1295 | std::string err; | |
1296 | *value = strict_strtoll(strvalue.c_str(), 10, &err); | |
1297 | if (!err.empty()) { | |
1298 | lderr(cct()) << key << " : " << err << dendl; | |
1299 | return -EINVAL; | |
1300 | } | |
1301 | } else { | |
1302 | return -ENOENT; | |
1303 | } | |
1304 | return 0; | |
1305 | } | |
1306 | ||
1307 | int libradosstriper::RadosStriperImpl::internal_get_layout_and_size( | |
1308 | const std::string& oid, | |
1309 | ceph_file_layout *layout, | |
1310 | uint64_t *size) | |
1311 | { | |
1312 | // get external attributes of the first rados object | |
1313 | std::map<std::string, bufferlist> attrs; | |
1314 | int rc = m_ioCtx.getxattrs(oid, attrs); | |
1315 | if (rc) return rc; | |
1316 | // deal with stripe_unit | |
1317 | rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_UNIT, &layout->fl_stripe_unit); | |
1318 | if (rc) return rc; | |
1319 | // deal with stripe_count | |
1320 | rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_COUNT, &layout->fl_stripe_count); | |
1321 | if (rc) return rc; | |
1322 | // deal with object_size | |
1323 | rc = extract_uint32_attr(attrs, XATTR_LAYOUT_OBJECT_SIZE, &layout->fl_object_size); | |
1324 | if (rc) return rc; | |
1325 | // deal with size | |
1326 | size_t ssize; | |
1327 | rc = extract_sizet_attr(attrs, XATTR_SIZE, &ssize); | |
1328 | if (rc) { | |
1329 | return rc; | |
1330 | } | |
1331 | *size = ssize; | |
1332 | // make valgrind happy by setting unused fl_pg_pool | |
1333 | layout->fl_pg_pool = 0; | |
1334 | return 0; | |
1335 | } | |
1336 | ||
1337 | int libradosstriper::RadosStriperImpl::openStripedObjectForRead( | |
1338 | const std::string& soid, | |
1339 | ceph_file_layout *layout, | |
1340 | uint64_t *size, | |
1341 | std::string *lockCookie) | |
1342 | { | |
1343 | // take a lock the first rados object, if it exists and gets its size | |
1344 | // check, lock and size reading must be atomic and are thus done within a single operation | |
1345 | librados::ObjectWriteOperation op; | |
1346 | op.assert_exists(); | |
1347 | *lockCookie = getUUID(); | |
1348 | utime_t dur = utime_t(); | |
f67539c2 | 1349 | rados::cls::lock::lock(&op, RADOS_LOCK_NAME, ClsLockType::SHARED, *lockCookie, "Tag", "", dur, 0); |
7c673cae FG |
1350 | std::string firstObjOid = getObjectId(soid, 0); |
1351 | int rc = m_ioCtx.operate(firstObjOid, &op); | |
1352 | if (rc) { | |
1353 | // error case (including -ENOENT) | |
1354 | return rc; | |
1355 | } | |
1356 | rc = internal_get_layout_and_size(firstObjOid, layout, size); | |
1357 | if (rc) { | |
1358 | unlockObject(soid, *lockCookie); | |
1359 | lderr(cct()) << "RadosStriperImpl::openStripedObjectForRead : " | |
1360 | << "could not load layout and size for " | |
1361 | << soid << " : rc = " << rc << dendl; | |
1362 | } | |
1363 | return rc; | |
1364 | } | |
1365 | ||
1366 | int libradosstriper::RadosStriperImpl::openStripedObjectForWrite(const std::string& soid, | |
1367 | ceph_file_layout *layout, | |
1368 | uint64_t *size, | |
1369 | std::string *lockCookie, | |
1370 | bool isFileSizeAbsolute) | |
1371 | { | |
1372 | // take a lock the first rados object, if it exists | |
1373 | // check and lock must be atomic and are thus done within a single operation | |
1374 | librados::ObjectWriteOperation op; | |
1375 | op.assert_exists(); | |
1376 | *lockCookie = getUUID(); | |
1377 | utime_t dur = utime_t(); | |
f67539c2 | 1378 | rados::cls::lock::lock(&op, RADOS_LOCK_NAME, ClsLockType::SHARED, *lockCookie, "Tag", "", dur, 0); |
7c673cae FG |
1379 | std::string firstObjOid = getObjectId(soid, 0); |
1380 | int rc = m_ioCtx.operate(firstObjOid, &op); | |
1381 | if (rc) { | |
1382 | if (rc == -ENOENT) { | |
1383 | // object does not exist, delegate to createEmptyStripedObject | |
1384 | int rc = createAndOpenStripedObject(soid, layout, *size, lockCookie, isFileSizeAbsolute); | |
1385 | // return original size | |
1386 | *size = 0; | |
1387 | return rc; | |
1388 | } else { | |
1389 | return rc; | |
1390 | } | |
1391 | } | |
1392 | // all fine | |
1393 | uint64_t curSize; | |
1394 | rc = internal_get_layout_and_size(firstObjOid, layout, &curSize); | |
1395 | if (rc) { | |
1396 | unlockObject(soid, *lockCookie); | |
1397 | lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : " | |
1398 | << "could not load layout and size for " | |
1399 | << soid << " : rc = " << rc << dendl; | |
1400 | return rc; | |
1401 | } | |
1402 | // atomically update object size, only if smaller than current one | |
1403 | if (!isFileSizeAbsolute) | |
1404 | *size += curSize; | |
1405 | librados::ObjectWriteOperation writeOp; | |
1406 | writeOp.cmpxattr(XATTR_SIZE, LIBRADOS_CMPXATTR_OP_GT, *size); | |
1407 | std::ostringstream oss; | |
1408 | oss << *size; | |
1409 | bufferlist bl; | |
1410 | bl.append(oss.str()); | |
1411 | writeOp.setxattr(XATTR_SIZE, bl); | |
1412 | rc = m_ioCtx.operate(firstObjOid, &writeOp); | |
1413 | // return current size | |
1414 | *size = curSize; | |
1415 | // handle case where objectsize is already bigger than size | |
1416 | if (-ECANCELED == rc) | |
1417 | rc = 0; | |
1418 | if (rc) { | |
1419 | unlockObject(soid, *lockCookie); | |
1420 | lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : " | |
1421 | << "could not set new size for " | |
1422 | << soid << " : rc = " << rc << dendl; | |
1423 | } | |
1424 | return rc; | |
1425 | } | |
1426 | ||
1427 | int libradosstriper::RadosStriperImpl::createAndOpenStripedObject(const std::string& soid, | |
1428 | ceph_file_layout *layout, | |
1429 | uint64_t size, | |
1430 | std::string *lockCookie, | |
1431 | bool isFileSizeAbsolute) | |
1432 | { | |
1433 | // build atomic write operation | |
1434 | librados::ObjectWriteOperation writeOp; | |
1435 | writeOp.create(true); | |
1436 | // object_size | |
1437 | std::ostringstream oss_object_size; | |
1438 | oss_object_size << m_layout.fl_object_size; | |
1439 | bufferlist bl_object_size; | |
1440 | bl_object_size.append(oss_object_size.str()); | |
1441 | writeOp.setxattr(XATTR_LAYOUT_OBJECT_SIZE, bl_object_size); | |
1442 | // stripe unit | |
1443 | std::ostringstream oss_stripe_unit; | |
1444 | oss_stripe_unit << m_layout.fl_stripe_unit; | |
1445 | bufferlist bl_stripe_unit; | |
1446 | bl_stripe_unit.append(oss_stripe_unit.str()); | |
1447 | writeOp.setxattr(XATTR_LAYOUT_STRIPE_UNIT, bl_stripe_unit); | |
1448 | // stripe count | |
1449 | std::ostringstream oss_stripe_count; | |
1450 | oss_stripe_count << m_layout.fl_stripe_count; | |
1451 | bufferlist bl_stripe_count; | |
1452 | bl_stripe_count.append(oss_stripe_count.str()); | |
1453 | writeOp.setxattr(XATTR_LAYOUT_STRIPE_COUNT, bl_stripe_count); | |
1454 | // size | |
1455 | std::ostringstream oss_size; | |
1456 | oss_size << (isFileSizeAbsolute?size:0); | |
1457 | bufferlist bl_size; | |
1458 | bl_size.append(oss_size.str()); | |
1459 | writeOp.setxattr(XATTR_SIZE, bl_size); | |
1460 | // effectively change attributes | |
1461 | std::string firstObjOid = getObjectId(soid, 0); | |
1462 | int rc = m_ioCtx.operate(firstObjOid, &writeOp); | |
1463 | // in case of error (but no EEXIST which would mean the object existed), return | |
1464 | if (rc && -EEXIST != rc) return rc; | |
1465 | // Otherwise open the object | |
1466 | uint64_t fileSize = size; | |
1467 | return openStripedObjectForWrite(soid, layout, &fileSize, lockCookie, isFileSizeAbsolute); | |
1468 | } | |
1469 | ||
1470 | static void striper_truncate_aio_req_complete(rados_striper_multi_completion_t c, void *arg) | |
1471 | { | |
9f95a23c | 1472 | auto cdata = ceph::ref_t<TruncateCompletionData>(static_cast<TruncateCompletionData*>(arg), false); |
7c673cae FG |
1473 | libradosstriper::MultiAioCompletionImpl *comp = |
1474 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
1475 | if (0 == comp->rval) { | |
1476 | // all went fine, change size in the external attributes | |
1477 | std::ostringstream oss; | |
1478 | oss << cdata->m_size; | |
1479 | bufferlist bl; | |
1480 | bl.append(oss.str()); | |
1481 | cdata->m_striper->setxattr(cdata->m_soid, XATTR_SIZE, bl); | |
1482 | } | |
7c673cae FG |
1483 | } |
1484 | ||
1485 | int libradosstriper::RadosStriperImpl::truncate(const std::string& soid, | |
1486 | uint64_t original_size, | |
1487 | uint64_t size, | |
1488 | ceph_file_layout &layout) | |
1489 | { | |
9f95a23c | 1490 | auto cdata = ceph::make_ref<TruncateCompletionData>(this, soid, size); |
224ce89b WB |
1491 | libradosstriper::MultiAioCompletionImplPtr multi_completion{ |
1492 | new libradosstriper::MultiAioCompletionImpl, false}; | |
9f95a23c | 1493 | multi_completion->set_complete_callback(cdata->get() /* create ref! */, striper_truncate_aio_req_complete); |
7c673cae FG |
1494 | // call asynchrous version of truncate |
1495 | int rc = aio_truncate(soid, multi_completion, original_size, size, layout); | |
1496 | // wait for completion of the truncation | |
1497 | multi_completion->finish_adding_requests(); | |
1498 | multi_completion->wait_for_complete_and_cb(); | |
1499 | // return result | |
1500 | if (rc == 0) { | |
1501 | rc = multi_completion->get_return_value(); | |
1502 | } | |
7c673cae FG |
1503 | return rc; |
1504 | } | |
1505 | ||
1506 | int libradosstriper::RadosStriperImpl::aio_truncate | |
1507 | (const std::string& soid, | |
224ce89b | 1508 | libradosstriper::MultiAioCompletionImplPtr multi_completion, |
7c673cae FG |
1509 | uint64_t original_size, |
1510 | uint64_t size, | |
1511 | ceph_file_layout &layout) | |
1512 | { | |
1513 | // handle the underlying rados objects. 3 cases here : | |
1514 | // -- the objects belonging to object sets entirely located | |
1515 | // before the truncation are unchanged | |
1516 | // -- the objects belonging to the object set where the | |
1517 | // truncation took place are truncated or removed | |
1518 | // -- the objects belonging to object sets entirely located | |
1519 | // after the truncation are removed | |
1520 | // Note that we do it backward and that we change the size in | |
1521 | // the external attributes only at the end. This make sure that | |
1522 | // no rados object stays behind if we remove the striped object | |
1523 | // after a truncation has failed | |
1524 | uint64_t trunc_objectsetno = size / layout.fl_object_size / layout.fl_stripe_count; | |
1525 | uint64_t last_objectsetno = original_size / layout.fl_object_size / layout.fl_stripe_count; | |
1526 | bool exists = false; | |
1527 | for (int64_t objectno = (last_objectsetno+1) * layout.fl_stripe_count-1; | |
1528 | objectno >= (int64_t)((trunc_objectsetno + 1) * layout.fl_stripe_count); | |
1529 | objectno--) { | |
1530 | // if no object existed so far, check object existence | |
1531 | if (!exists) { | |
1532 | uint64_t nb_full_object_set = objectno / layout.fl_stripe_count; | |
1533 | uint64_t object_index_in_set = objectno % layout.fl_stripe_count; | |
1534 | uint64_t set_start_off = nb_full_object_set * layout.fl_object_size * layout.fl_stripe_count; | |
1535 | uint64_t object_start_off = set_start_off + object_index_in_set * layout.fl_stripe_unit; | |
1536 | exists = (original_size > object_start_off); | |
1537 | } | |
1538 | if (exists) { | |
1539 | // remove asynchronously | |
1540 | multi_completion->add_request(); | |
9f95a23c | 1541 | auto data = ceph::make_ref<RadosRemoveCompletionData>(multi_completion, cct()); |
7c673cae | 1542 | librados::AioCompletion *rados_completion = |
9f95a23c TL |
1543 | librados::Rados::aio_create_completion(data->get() /* create ref! */, |
1544 | rados_req_remove_complete); | |
7c673cae FG |
1545 | int rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion); |
1546 | rados_completion->release(); | |
1547 | // in case the object did not exist, it means we had a sparse file, all is fine | |
1548 | if (rc && rc != -ENOENT) return rc; | |
1549 | } | |
1550 | } | |
1551 | for (int64_t objectno = ((trunc_objectsetno + 1) * layout.fl_stripe_count) -1; | |
1552 | objectno >= (int64_t)(trunc_objectsetno * layout.fl_stripe_count); | |
1553 | objectno--) { | |
1554 | // if no object existed so far, check object existence | |
1555 | if (!exists) { | |
1556 | uint64_t object_start_off = ((objectno / layout.fl_stripe_count) * layout.fl_object_size) + | |
1557 | ((objectno % layout.fl_stripe_count) * layout.fl_stripe_unit); | |
1558 | exists = (original_size > object_start_off); | |
1559 | } | |
1560 | if (exists) { | |
1561 | // truncate | |
1562 | file_layout_t l; | |
1563 | l.from_legacy(layout); | |
1564 | uint64_t new_object_size = Striper::object_truncate_size(cct(), &l, objectno, size); | |
1565 | int rc; | |
1566 | if (new_object_size > 0 or 0 == objectno) { | |
1567 | // trunc is synchronous as there is no async version | |
1568 | // but note that only a single object will be truncated | |
1569 | // reducing the overload to a fixed amount | |
1570 | rc = m_ioCtx.trunc(getObjectId(soid, objectno), new_object_size); | |
1571 | } else { | |
1572 | // removes are asynchronous in order to speed up truncations of big files | |
1573 | multi_completion->add_request(); | |
9f95a23c | 1574 | auto data = ceph::make_ref<RadosRemoveCompletionData>(multi_completion, cct()); |
7c673cae | 1575 | librados::AioCompletion *rados_completion = |
9f95a23c TL |
1576 | librados::Rados::aio_create_completion(data->get() /* create ref! */, |
1577 | rados_req_remove_complete); | |
7c673cae FG |
1578 | rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion); |
1579 | rados_completion->release(); | |
1580 | } | |
1581 | // in case the object did not exist, it means we had a sparse file, all is fine | |
1582 | if (rc && rc != -ENOENT) return rc; | |
1583 | } | |
1584 | } | |
1585 | return 0; | |
1586 | } | |
1587 | ||
1588 | int libradosstriper::RadosStriperImpl::grow(const std::string& soid, | |
1589 | uint64_t original_size, | |
1590 | uint64_t size, | |
1591 | ceph_file_layout &layout) | |
1592 | { | |
1593 | // handle the underlying rados objects. As we support sparse objects, | |
1594 | // we only have to change the size in the external attributes | |
1595 | std::ostringstream oss; | |
1596 | oss << size; | |
1597 | bufferlist bl; | |
1598 | bl.append(oss.str()); | |
1599 | int rc = m_ioCtx.setxattr(getObjectId(soid, 0), XATTR_SIZE, bl); | |
1600 | return rc; | |
1601 | } | |
1602 | ||
1603 | std::string libradosstriper::RadosStriperImpl::getUUID() | |
1604 | { | |
1605 | struct uuid_d uuid; | |
1606 | uuid.generate_random(); | |
1607 | char suuid[37]; | |
1608 | uuid.print(suuid); | |
1609 | return std::string(suuid); | |
1610 | } |