]>
Commit | Line | Data |
---|---|---|
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- | |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2014 Sebastien Ponce <sebastien.ponce@cern.ch> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include <boost/algorithm/string/replace.hpp> | |
16 | ||
17 | #include "libradosstriper/RadosStriperImpl.h" | |
18 | ||
19 | #include <errno.h> | |
20 | ||
21 | #include <sstream> | |
22 | #include <iomanip> | |
23 | #include <algorithm> | |
24 | ||
25 | #include "include/types.h" | |
26 | #include "include/uuid.h" | |
27 | #include "include/ceph_fs.h" | |
28 | #include "common/dout.h" | |
29 | #include "common/strtol.h" | |
30 | #include "common/RefCountedObj.h" | |
31 | #include "osdc/Striper.h" | |
32 | #include "librados/AioCompletionImpl.h" | |
33 | #include <cls/lock/cls_lock_client.h> | |
34 | ||
35 | /* | |
36 | * This file contains the actual implementation of the rados striped objects interface. | |
37 | * | |
38 | * Striped objects are stored in rados in a set of regular rados objects, after their | |
39 | * content has been striped using the osdc/Striper interface. | |
40 | * | |
41 | * The external attributes of the striped object are mapped to the attributes of the | |
42 | * first underlying object. This first object has a set of extra external attributes | |
43 | * storing the layout of the striped object for future read back. These attributes are : | |
44 | * - striper.layout.object_size : the size of rados objects used. | |
45 | * Must be a multiple of striper.layout.stripe_unit | |
46 | * - striper.layout.stripe_unit : the size of a stripe unit | |
47 | * - striper.layout.stripe_count : the number of stripes used | |
48 | * - striper.size : total striped object size | |
49 | * | |
50 | * In general operations on striped objects are not atomic. | |
51 | * However, a certain number of safety guards have been put to make the interface closer | |
52 | * to atomicity : | |
53 | * - each data operation takes a shared lock on the first rados object for the | |
54 | * whole time of the operation | |
55 | * - the remove and trunc operations take an exclusive lock on the first rados object | |
56 | * for the whole time of the operation | |
57 | * This makes sure that no removal/truncation of a striped object occurs while | |
58 | * data operations are happening and vice versa. It thus makes sure that the layout | |
59 | * of a striped object does not change during data operation, which is essential for | |
60 | * data consistency. | |
61 | * | |
62 | * Still the writing to a striped object is not atomic. This means in particular that | |
63 | * the size of an object may not be in sync with its content at all times. | |
64 | * As the size is always guaranteed to be updated first and in an atomic way, and as | |
65 | * sparse striped objects are supported (see below), what will typically happen is | |
66 | * that a reader that comes too soon after a write will read 0s instead of the actual | |
67 | * data. | |
68 | * | |
69 | * Note that remove handles the pieces of the striped object in reverse order, | |
70 | * so that the head object is removed last, making the completion of the deletion atomic. | |
71 | * | |
72 | * Striped objects can be sparse, typically in case data was written at the end of the | |
73 | * striped object only. In such a case, some rados objects constituting the striped object | |
74 | * may be missing. Others can be partial (only the beginning will have data). | |
75 | * When dealing with such sparse striped files, missing objects are detected and | |
76 | * considered as full of 0s. They are however not created until real data is written | |
77 | * to them. | |
78 | * | |
79 | * There are a number of missing features/improvements that could be implemented. | |
80 | * Here are some ideas : | |
81 | * - implementation of missing entry points (compared to rados) | |
82 | * In particular : clone_range, sparse_read, exec, aio_flush_async, tmaps, omaps, ... | |
83 | * | |
84 | */ | |
85 | ||
86 | #define dout_subsys ceph_subsys_rados | |
87 | #undef dout_prefix | |
88 | #define dout_prefix *_dout << "libradosstriper: " | |
89 | ||
90 | /// size of xattr buffer | |
91 | #define XATTR_BUFFER_SIZE 32 | |
92 | ||
93 | /// names of the different xattr entries | |
94 | #define XATTR_LAYOUT_STRIPE_UNIT "striper.layout.stripe_unit" | |
95 | #define XATTR_LAYOUT_STRIPE_COUNT "striper.layout.stripe_count" | |
96 | #define XATTR_LAYOUT_OBJECT_SIZE "striper.layout.object_size" | |
97 | #define XATTR_SIZE "striper.size" | |
98 | #define LOCK_PREFIX "lock." | |
99 | ||
100 | /// name of the lock used on objects to ensure layout stability during IO | |
101 | #define RADOS_LOCK_NAME "striper.lock" | |
102 | ||
103 | /// format of the extension of rados objects created for a given striped object | |
104 | #define RADOS_OBJECT_EXTENSION_FORMAT ".%016llx" | |
105 | ||
106 | /// default object layout | |
107 | struct ceph_file_layout default_file_layout = { | |
108 | init_le32(1<<22), // fl_stripe_unit | |
109 | init_le32(1), // fl_stripe_count | |
110 | init_le32(1<<22), // fl_object_size | |
111 | init_le32(0), // fl_cas_hash | |
112 | init_le32(0), // fl_object_stripe_unit | |
113 | init_le32(-1), // fl_unused | |
114 | init_le32(-1), // fl_pg_pool | |
115 | }; | |
116 | ||
117 | using libradosstriper::MultiAioCompletionImplPtr; | |
118 | ||
119 | namespace { | |
120 | ||
121 | ///////////////////////// CompletionData ///////////////////////////// | |
122 | ||
123 | /** | |
124 | * struct handling the data needed to pass to the call back | |
125 | * function in asynchronous operations | |
126 | */ | |
127 | struct CompletionData : RefCountedObject { | |
128 | /// complete method | |
129 | void complete(int r); | |
130 | /// striper to be used to handle the write completion | |
131 | libradosstriper::RadosStriperImpl *m_striper; | |
132 | /// striped object concerned by the write operation | |
133 | std::string m_soid; | |
134 | /// shared lock to be released at completion | |
135 | std::string m_lockCookie; | |
136 | /// completion handler | |
137 | librados::IoCtxImpl::C_aio_Complete *m_ack; | |
138 | protected: | |
139 | CompletionData(libradosstriper::RadosStriperImpl * striper, | |
140 | const std::string& soid, | |
141 | const std::string& lockCookie, | |
142 | librados::AioCompletionImpl *userCompletion = 0); | |
143 | ~CompletionData() override; | |
144 | ||
145 | }; | |
146 | ||
147 | CompletionData::CompletionData | |
148 | (libradosstriper::RadosStriperImpl* striper, | |
149 | const std::string& soid, | |
150 | const std::string& lockCookie, | |
151 | librados::AioCompletionImpl *userCompletion) : | |
152 | RefCountedObject(striper->cct()), | |
153 | m_striper(striper), m_soid(soid), m_lockCookie(lockCookie), m_ack(0) { | |
154 | m_striper->get(); | |
155 | if (userCompletion) { | |
156 | m_ack = new librados::IoCtxImpl::C_aio_Complete(userCompletion); | |
157 | userCompletion->io = striper->m_ioCtxImpl; | |
158 | } | |
159 | } | |
160 | ||
161 | CompletionData::~CompletionData() { | |
162 | if (m_ack) delete m_ack; | |
163 | m_striper->put(); | |
164 | } | |
165 | ||
166 | void CompletionData::complete(int r) { | |
167 | if (m_ack) m_ack->finish(r); | |
168 | } | |
169 | ||
170 | /** | |
171 | * struct handling the data needed to pass to the call back | |
172 | * function in asynchronous read operations | |
173 | */ | |
174 | struct ReadCompletionData : CompletionData { | |
175 | /// bufferlist containing final result | |
176 | bufferlist* m_bl; | |
177 | /// extents that will be read | |
178 | std::vector<ObjectExtent>* m_extents; | |
179 | /// intermediate results | |
180 | std::vector<bufferlist>* m_resultbl; | |
181 | /// return code of read completion, to be remembered until unlocking happened | |
182 | int m_readRc; | |
183 | /// completion object for the unlocking of the striped object at the end of the read | |
184 | librados::AioCompletion *m_unlockCompletion; | |
185 | /// complete method for when reading is over | |
186 | void complete_read(int r); | |
187 | /// complete method for when object is unlocked | |
188 | void complete_unlock(int r); | |
189 | ||
190 | private: | |
191 | FRIEND_MAKE_REF(ReadCompletionData); | |
192 | ReadCompletionData(libradosstriper::RadosStriperImpl * striper, | |
193 | const std::string& soid, | |
194 | const std::string& lockCookie, | |
195 | librados::AioCompletionImpl *userCompletion, | |
196 | bufferlist* bl, | |
197 | std::vector<ObjectExtent>* extents, | |
198 | std::vector<bufferlist>* resultbl); | |
199 | ~ReadCompletionData() override; | |
200 | }; | |
201 | ||
202 | ReadCompletionData::ReadCompletionData | |
203 | (libradosstriper::RadosStriperImpl* striper, | |
204 | const std::string& soid, | |
205 | const std::string& lockCookie, | |
206 | librados::AioCompletionImpl *userCompletion, | |
207 | bufferlist* bl, | |
208 | std::vector<ObjectExtent>* extents, | |
209 | std::vector<bufferlist>* resultbl) : | |
210 | CompletionData(striper, soid, lockCookie, userCompletion), | |
211 | m_bl(bl), m_extents(extents), m_resultbl(resultbl), m_readRc(0), | |
212 | m_unlockCompletion(0) {} | |
213 | ||
214 | ReadCompletionData::~ReadCompletionData() { | |
215 | m_unlockCompletion->release(); | |
216 | delete m_extents; | |
217 | delete m_resultbl; | |
218 | } | |
219 | ||
220 | void ReadCompletionData::complete_read(int r) { | |
221 | // gather data into final buffer | |
222 | Striper::StripedReadResult readResult; | |
223 | vector<bufferlist>::iterator bit = m_resultbl->begin(); | |
224 | for (vector<ObjectExtent>::iterator eit = m_extents->begin(); | |
225 | eit != m_extents->end(); | |
226 | ++eit, ++bit) { | |
227 | readResult.add_partial_result(m_striper->cct(), *bit, eit->buffer_extents); | |
228 | } | |
229 | m_bl->clear(); | |
230 | readResult.assemble_result(m_striper->cct(), *m_bl, true); | |
231 | // Remember return code | |
232 | m_readRc = r; | |
233 | } | |
234 | ||
235 | void ReadCompletionData::complete_unlock(int r) { | |
236 | // call parent's completion method | |
237 | // Note that we ignore the return code of the unlock as we cannot do much about it | |
238 | CompletionData::complete(m_readRc?m_readRc:m_bl->length()); | |
239 | } | |
240 | ||
241 | /** | |
242 | * struct handling the data needed to pass to the call back | |
243 | * function in asynchronous write operations | |
244 | */ | |
245 | struct WriteCompletionData : CompletionData { | |
246 | /// safe completion handler | |
247 | librados::IoCtxImpl::C_aio_Complete *m_safe; | |
248 | /// completion object for the unlocking of the striped object at the end of the write | |
249 | librados::AioCompletion *m_unlockCompletion; | |
250 | /// return code of write completion, to be remembered until unlocking happened | |
251 | int m_writeRc; | |
252 | /// complete method for when writing is over | |
253 | void complete_write(int r); | |
254 | /// complete method for when object is unlocked | |
255 | void complete_unlock(int r); | |
256 | /// safe method | |
257 | void safe(int r); | |
258 | private: | |
259 | FRIEND_MAKE_REF(WriteCompletionData); | |
260 | /// constructor | |
261 | WriteCompletionData(libradosstriper::RadosStriperImpl * striper, | |
262 | const std::string& soid, | |
263 | const std::string& lockCookie, | |
264 | librados::AioCompletionImpl *userCompletion); | |
265 | /// destructor | |
266 | ~WriteCompletionData() override; | |
267 | }; | |
268 | ||
269 | WriteCompletionData::WriteCompletionData | |
270 | (libradosstriper::RadosStriperImpl* striper, | |
271 | const std::string& soid, | |
272 | const std::string& lockCookie, | |
273 | librados::AioCompletionImpl *userCompletion) : | |
274 | CompletionData(striper, soid, lockCookie, userCompletion), | |
275 | m_safe(0), m_unlockCompletion(0), m_writeRc(0) { | |
276 | if (userCompletion) { | |
277 | m_safe = new librados::IoCtxImpl::C_aio_Complete(userCompletion); | |
278 | } | |
279 | } | |
280 | ||
281 | WriteCompletionData::~WriteCompletionData() { | |
282 | m_unlockCompletion->release(); | |
283 | if (m_safe) delete m_safe; | |
284 | } | |
285 | ||
286 | void WriteCompletionData::complete_unlock(int r) { | |
287 | // call parent's completion method | |
288 | // Note that we ignore the return code of the unlock as we cannot do much about it | |
289 | CompletionData::complete(m_writeRc); | |
290 | } | |
291 | ||
292 | void WriteCompletionData::complete_write(int r) { | |
293 | // Remember return code | |
294 | m_writeRc = r; | |
295 | } | |
296 | ||
297 | void WriteCompletionData::safe(int r) { | |
298 | if (m_safe) m_safe->finish(r); | |
299 | } | |
300 | ||
301 | struct RemoveCompletionData : CompletionData { | |
302 | /// removal flags | |
303 | int flags; | |
304 | ||
305 | private: | |
306 | FRIEND_MAKE_REF(RemoveCompletionData); | |
307 | /** | |
308 | * constructor | |
309 | * note that the constructed object will take ownership of the lock | |
310 | */ | |
311 | RemoveCompletionData(libradosstriper::RadosStriperImpl * striper, | |
312 | const std::string& soid, | |
313 | const std::string& lockCookie, | |
314 | librados::AioCompletionImpl *userCompletion, | |
315 | int flags = 0) : | |
316 | CompletionData(striper, soid, lockCookie, userCompletion), flags(flags) {} | |
317 | }; | |
318 | ||
319 | /** | |
320 | * struct handling the data needed to pass to the call back | |
321 | * function in asynchronous truncate operations | |
322 | */ | |
323 | struct TruncateCompletionData : RefCountedObject { | |
324 | /// striper to be used | |
325 | libradosstriper::RadosStriperImpl *m_striper; | |
326 | /// striped object concerned by the truncate operation | |
327 | std::string m_soid; | |
328 | /// the final size of the truncated object | |
329 | uint64_t m_size; | |
330 | ||
331 | private: | |
332 | FRIEND_MAKE_REF(TruncateCompletionData); | |
333 | /// constructor | |
334 | TruncateCompletionData(libradosstriper::RadosStriperImpl* striper, | |
335 | const std::string& soid, | |
336 | uint64_t size) : | |
337 | RefCountedObject(striper->cct()), | |
338 | m_striper(striper), m_soid(soid), m_size(size) { | |
339 | m_striper->get(); | |
340 | } | |
341 | /// destructor | |
342 | ~TruncateCompletionData() override { | |
343 | m_striper->put(); | |
344 | } | |
345 | }; | |
346 | ||
347 | /** | |
348 | * struct handling the data needed to pass to the call back | |
349 | * function in asynchronous read operations of a Rados File | |
350 | */ | |
351 | struct RadosReadCompletionData : RefCountedObject { | |
352 | /// the multi asynch io completion object to be used | |
353 | MultiAioCompletionImplPtr m_multiAioCompl; | |
354 | /// the expected number of bytes | |
355 | uint64_t m_expectedBytes; | |
356 | /// the bufferlist object where data have been written | |
357 | bufferlist *m_bl; | |
358 | ||
359 | private: | |
360 | FRIEND_MAKE_REF(RadosReadCompletionData); | |
361 | /// constructor | |
362 | RadosReadCompletionData(MultiAioCompletionImplPtr multiAioCompl, | |
363 | uint64_t expectedBytes, | |
364 | bufferlist *bl, | |
365 | CephContext *context) : | |
366 | RefCountedObject(context), | |
367 | m_multiAioCompl(multiAioCompl), m_expectedBytes(expectedBytes), m_bl(bl) {} | |
368 | }; | |
369 | ||
370 | /** | |
371 | * struct handling (most of) the data needed to pass to the call back | |
372 | * function in asynchronous stat operations. | |
373 | * Inherited by the actual type for adding time information in different | |
374 | * versions (time_t or struct timespec) | |
375 | */ | |
376 | struct BasicStatCompletionData : CompletionData { | |
377 | // MultiAioCompletionImpl used to handle the double aysnc | |
378 | // call in the back (stat + getxattr) | |
379 | libradosstriper::MultiAioCompletionImpl *m_multiCompletion; | |
380 | // where to store the size of first objct | |
381 | // this will be ignored but we need a place to store it when | |
382 | // async stat is called | |
383 | uint64_t m_objectSize; | |
384 | // where to store the file size | |
385 | uint64_t *m_psize; | |
386 | /// the bufferlist object used for the getxattr call | |
387 | bufferlist m_bl; | |
388 | /// return code of the stat | |
389 | int m_statRC; | |
390 | /// return code of the getxattr | |
391 | int m_getxattrRC; | |
392 | ||
393 | protected: | |
394 | /// constructor | |
395 | BasicStatCompletionData(libradosstriper::RadosStriperImpl* striper, | |
396 | const std::string& soid, | |
397 | librados::AioCompletionImpl *userCompletion, | |
398 | libradosstriper::MultiAioCompletionImpl *multiCompletion, | |
399 | uint64_t *psize) : | |
400 | CompletionData(striper, soid, "", userCompletion), | |
401 | m_multiCompletion(multiCompletion), m_psize(psize), | |
402 | m_statRC(0), m_getxattrRC(0) {}; | |
403 | ||
404 | }; | |
405 | ||
406 | /** | |
407 | * struct handling the data needed to pass to the call back | |
408 | * function in asynchronous stat operations. | |
409 | * Simple templated extension of BasicStatCompletionData. | |
410 | * The template parameter is the type of the time information | |
411 | * (used with time_t for stat and struct timespec for stat2) | |
412 | */ | |
413 | template<class TimeType> | |
414 | struct StatCompletionData : BasicStatCompletionData { | |
415 | // where to store the file time | |
416 | TimeType *m_pmtime; | |
417 | private: | |
418 | FRIEND_MAKE_REF(StatCompletionData); | |
419 | /// constructor | |
420 | StatCompletionData<TimeType>(libradosstriper::RadosStriperImpl* striper, | |
421 | const std::string& soid, | |
422 | librados::AioCompletionImpl *userCompletion, | |
423 | libradosstriper::MultiAioCompletionImpl *multiCompletion, | |
424 | uint64_t *psize, | |
425 | TimeType *pmtime) : | |
426 | BasicStatCompletionData(striper, soid, userCompletion, multiCompletion, psize), | |
427 | m_pmtime(pmtime) {}; | |
428 | }; | |
429 | ||
430 | /** | |
431 | * struct handling the data needed to pass to the call back | |
432 | * function in asynchronous remove operations of a Rados File | |
433 | */ | |
434 | struct RadosRemoveCompletionData : RefCountedObject { | |
435 | /// the multi asynch io completion object to be used | |
436 | MultiAioCompletionImplPtr m_multiAioCompl; | |
437 | private: | |
438 | FRIEND_MAKE_REF(RadosRemoveCompletionData); | |
439 | /// constructor | |
440 | RadosRemoveCompletionData(MultiAioCompletionImplPtr multiAioCompl, | |
441 | CephContext *context) : | |
442 | RefCountedObject(context), | |
443 | m_multiAioCompl(multiAioCompl) {}; | |
444 | }; | |
445 | ||
446 | ||
447 | } // namespace { | |
448 | ||
449 | ///////////////////////// constructor ///////////////////////////// | |
450 | ||
451 | libradosstriper::RadosStriperImpl::RadosStriperImpl(librados::IoCtx& ioctx, librados::IoCtxImpl *ioctx_impl) : | |
452 | m_refCnt(0), m_radosCluster(ioctx), m_ioCtx(ioctx), m_ioCtxImpl(ioctx_impl), | |
453 | m_layout(default_file_layout) {} | |
454 | ||
455 | ///////////////////////// layout ///////////////////////////// | |
456 | ||
457 | int libradosstriper::RadosStriperImpl::setObjectLayoutStripeUnit | |
458 | (unsigned int stripe_unit) | |
459 | { | |
460 | /* stripe unit must be non-zero, 64k increment */ | |
461 | if (!stripe_unit || (stripe_unit & (CEPH_MIN_STRIPE_UNIT-1))) | |
462 | return -EINVAL; | |
463 | m_layout.fl_stripe_unit = stripe_unit; | |
464 | return 0; | |
465 | } | |
466 | ||
467 | int libradosstriper::RadosStriperImpl::setObjectLayoutStripeCount | |
468 | (unsigned int stripe_count) | |
469 | { | |
470 | /* stripe count must be non-zero */ | |
471 | if (!stripe_count) | |
472 | return -EINVAL; | |
473 | m_layout.fl_stripe_count = stripe_count; | |
474 | return 0; | |
475 | } | |
476 | ||
477 | int libradosstriper::RadosStriperImpl::setObjectLayoutObjectSize | |
478 | (unsigned int object_size) | |
479 | { | |
480 | /* object size must be non-zero, 64k increment */ | |
481 | if (!object_size || (object_size & (CEPH_MIN_STRIPE_UNIT-1))) | |
482 | return -EINVAL; | |
483 | /* object size must be a multiple of stripe unit */ | |
484 | if (object_size < m_layout.fl_stripe_unit || | |
485 | object_size % m_layout.fl_stripe_unit) | |
486 | return -EINVAL; | |
487 | m_layout.fl_object_size = object_size; | |
488 | return 0; | |
489 | } | |
490 | ||
491 | ///////////////////////// xattrs ///////////////////////////// | |
492 | ||
493 | int libradosstriper::RadosStriperImpl::getxattr(const object_t& soid, | |
494 | const char *name, | |
495 | bufferlist& bl) | |
496 | { | |
497 | std::string firstObjOid = getObjectId(soid, 0); | |
498 | return m_ioCtx.getxattr(firstObjOid, name, bl); | |
499 | } | |
500 | ||
501 | int libradosstriper::RadosStriperImpl::setxattr(const object_t& soid, | |
502 | const char *name, | |
503 | bufferlist& bl) | |
504 | { | |
505 | std::string firstObjOid = getObjectId(soid, 0); | |
506 | return m_ioCtx.setxattr(firstObjOid, name, bl); | |
507 | } | |
508 | ||
509 | int libradosstriper::RadosStriperImpl::getxattrs(const object_t& soid, | |
510 | map<string, bufferlist>& attrset) | |
511 | { | |
512 | std::string firstObjOid = getObjectId(soid, 0); | |
513 | int rc = m_ioCtx.getxattrs(firstObjOid, attrset); | |
514 | if (rc) return rc; | |
515 | // cleanup internal attributes dedicated to striping and locking | |
516 | attrset.erase(XATTR_LAYOUT_STRIPE_UNIT); | |
517 | attrset.erase(XATTR_LAYOUT_STRIPE_COUNT); | |
518 | attrset.erase(XATTR_LAYOUT_OBJECT_SIZE); | |
519 | attrset.erase(XATTR_SIZE); | |
520 | attrset.erase(std::string(LOCK_PREFIX) + RADOS_LOCK_NAME); | |
521 | return rc; | |
522 | } | |
523 | ||
524 | int libradosstriper::RadosStriperImpl::rmxattr(const object_t& soid, | |
525 | const char *name) | |
526 | { | |
527 | std::string firstObjOid = getObjectId(soid, 0); | |
528 | return m_ioCtx.rmxattr(firstObjOid, name); | |
529 | } | |
530 | ||
531 | ///////////////////////// io ///////////////////////////// | |
532 | ||
533 | int libradosstriper::RadosStriperImpl::write(const std::string& soid, | |
534 | const bufferlist& bl, | |
535 | size_t len, | |
536 | uint64_t off) | |
537 | { | |
538 | // open the object. This will create it if needed, retrieve its layout | |
539 | // and size and take a shared lock on it | |
540 | ceph_file_layout layout; | |
541 | std::string lockCookie; | |
542 | int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true); | |
543 | if (rc) return rc; | |
544 | return write_in_open_object(soid, layout, lockCookie, bl, len, off); | |
545 | } | |
546 | ||
547 | int libradosstriper::RadosStriperImpl::append(const std::string& soid, | |
548 | const bufferlist& bl, | |
549 | size_t len) | |
550 | { | |
551 | // open the object. This will create it if needed, retrieve its layout | |
552 | // and size and take a shared lock on it | |
553 | ceph_file_layout layout; | |
554 | uint64_t size = len; | |
555 | std::string lockCookie; | |
556 | int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false); | |
557 | if (rc) return rc; | |
558 | return write_in_open_object(soid, layout, lockCookie, bl, len, size); | |
559 | } | |
560 | ||
561 | int libradosstriper::RadosStriperImpl::write_full(const std::string& soid, | |
562 | const bufferlist& bl) | |
563 | { | |
564 | int rc = trunc(soid, 0); | |
565 | if (rc && rc != -ENOENT) return rc; // ENOENT is obviously ok | |
566 | return write(soid, bl, bl.length(), 0); | |
567 | } | |
568 | ||
569 | int libradosstriper::RadosStriperImpl::read(const std::string& soid, | |
570 | bufferlist* bl, | |
571 | size_t len, | |
572 | uint64_t off) | |
573 | { | |
574 | // create a completion object | |
575 | librados::AioCompletionImpl c; | |
576 | // call asynchronous method | |
577 | int rc = aio_read(soid, &c, bl, len, off); | |
578 | // and wait for completion | |
579 | if (!rc) { | |
580 | // wait for completion | |
581 | c.wait_for_complete_and_cb(); | |
582 | // return result | |
583 | rc = c.get_return_value(); | |
584 | } | |
585 | return rc; | |
586 | } | |
587 | ||
588 | ///////////////////////// asynchronous io ///////////////////////////// | |
589 | ||
590 | int libradosstriper::RadosStriperImpl::aio_write(const std::string& soid, | |
591 | librados::AioCompletionImpl *c, | |
592 | const bufferlist& bl, | |
593 | size_t len, | |
594 | uint64_t off) | |
595 | { | |
596 | ceph_file_layout layout; | |
597 | std::string lockCookie; | |
598 | int rc = createAndOpenStripedObject(soid, &layout, len+off, &lockCookie, true); | |
599 | if (rc) return rc; | |
600 | return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, off); | |
601 | } | |
602 | ||
603 | int libradosstriper::RadosStriperImpl::aio_append(const std::string& soid, | |
604 | librados::AioCompletionImpl *c, | |
605 | const bufferlist& bl, | |
606 | size_t len) | |
607 | { | |
608 | ceph_file_layout layout; | |
609 | uint64_t size = len; | |
610 | std::string lockCookie; | |
611 | int rc = openStripedObjectForWrite(soid, &layout, &size, &lockCookie, false); | |
612 | if (rc) return rc; | |
613 | // create a completion object | |
614 | return aio_write_in_open_object(soid, c, layout, lockCookie, bl, len, size); | |
615 | } | |
616 | ||
617 | int libradosstriper::RadosStriperImpl::aio_write_full(const std::string& soid, | |
618 | librados::AioCompletionImpl *c, | |
619 | const bufferlist& bl) | |
620 | { | |
621 | int rc = trunc(soid, 0); | |
622 | if (rc) return rc; | |
623 | return aio_write(soid, c, bl, bl.length(), 0); | |
624 | } | |
625 | ||
626 | static void rados_read_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg) | |
627 | { | |
628 | auto cdata = ceph::ref_t<ReadCompletionData>(static_cast<ReadCompletionData*>(arg), false); | |
629 | libradosstriper::MultiAioCompletionImpl *comp = | |
630 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
631 | cdata->complete_unlock(comp->rval); | |
632 | } | |
633 | ||
634 | static void striper_read_aio_req_complete(rados_striper_multi_completion_t c, void *arg) | |
635 | { | |
636 | auto cdata = static_cast<ReadCompletionData*>(arg); | |
637 | // launch the async unlocking of the object | |
638 | cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion); | |
639 | // complete the read part in parallel | |
640 | libradosstriper::MultiAioCompletionImpl *comp = | |
641 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
642 | cdata->complete_read(comp->rval); | |
643 | } | |
644 | ||
645 | static void rados_req_read_complete(rados_completion_t c, void *arg) | |
646 | { | |
647 | auto data = static_cast<RadosReadCompletionData*>(arg); | |
648 | int rc = rados_aio_get_return_value(c); | |
649 | // We need to handle the case of sparse files here | |
650 | if (rc == -ENOENT) { | |
651 | // the object did not exist at all. This can happen for sparse files. | |
652 | // we consider we've read 0 bytes and it will fall into next case | |
653 | rc = 0; | |
654 | } | |
655 | ssize_t nread = rc; | |
656 | if (rc >= 0 && (((uint64_t)rc) < data->m_expectedBytes)) { | |
657 | // only partial data were present in the object (or the object did not | |
658 | // even exist if we've gone through previous case). | |
659 | // This is typical of sparse file and we need to complete with 0s. | |
660 | unsigned int lenOfZeros = data->m_expectedBytes-rc; | |
661 | unsigned int existingDataToZero = min(data->m_bl->length()-rc, lenOfZeros); | |
662 | if (existingDataToZero > 0) { | |
663 | data->m_bl->zero(rc, existingDataToZero); | |
664 | } | |
665 | if (lenOfZeros > existingDataToZero) { | |
666 | ceph::bufferptr zeros(ceph::buffer::create(lenOfZeros-existingDataToZero)); | |
667 | zeros.zero(); | |
668 | data->m_bl->push_back(zeros); | |
669 | } | |
670 | nread = data->m_expectedBytes; | |
671 | } | |
672 | auto multi_aio_comp = data->m_multiAioCompl; | |
673 | multi_aio_comp->complete_request(nread); | |
674 | multi_aio_comp->safe_request(rc); | |
675 | } | |
676 | ||
677 | int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid, | |
678 | librados::AioCompletionImpl *c, | |
679 | bufferlist* bl, | |
680 | size_t len, | |
681 | uint64_t off) | |
682 | { | |
683 | // open the object. This will retrieve its layout and size | |
684 | // and take a shared lock on it | |
685 | ceph_file_layout layout; | |
686 | uint64_t size; | |
687 | std::string lockCookie; | |
688 | int rc = openStripedObjectForRead(soid, &layout, &size, &lockCookie); | |
689 | if (rc) return rc; | |
690 | // find out the actual number of bytes we can read | |
691 | uint64_t read_len; | |
692 | if (off >= size) { | |
693 | // nothing to read ! We are done. | |
694 | read_len = 0; | |
695 | } else { | |
696 | read_len = min(len, (size_t)(size-off)); | |
697 | } | |
698 | // get list of extents to be read from | |
699 | vector<ObjectExtent> *extents = new vector<ObjectExtent>(); | |
700 | if (read_len > 0) { | |
701 | std::string format = soid; | |
702 | boost::replace_all(format, "%", "%%"); | |
703 | format += RADOS_OBJECT_EXTENSION_FORMAT; | |
704 | file_layout_t l; | |
705 | l.from_legacy(layout); | |
706 | Striper::file_to_extents(cct(), format.c_str(), &l, off, read_len, | |
707 | 0, *extents); | |
708 | } | |
709 | ||
710 | // create a completion object and transfer ownership of extents and resultbl | |
711 | vector<bufferlist> *resultbl = new vector<bufferlist>(extents->size()); | |
712 | auto cdata = ceph::make_ref<ReadCompletionData>(this, soid, lockCookie, c, bl, extents, resultbl); | |
713 | c->is_read = true; | |
714 | c->io = m_ioCtxImpl; | |
715 | // create a completion for the unlocking of the striped object at the end of the read | |
716 | librados::AioCompletion *unlock_completion = | |
717 | librados::Rados::aio_create_completion(cdata->get() /* create ref! */, rados_read_aio_unlock_complete); | |
718 | cdata->m_unlockCompletion = unlock_completion; | |
719 | // create the multiCompletion object handling the reads | |
720 | MultiAioCompletionImplPtr nc{new libradosstriper::MultiAioCompletionImpl, | |
721 | false}; | |
722 | nc->set_complete_callback(cdata.get(), striper_read_aio_req_complete); | |
723 | // go through the extents | |
724 | int r = 0, i = 0; | |
725 | for (vector<ObjectExtent>::iterator p = extents->begin(); p != extents->end(); ++p) { | |
726 | // create a buffer list describing where to place data read from current extend | |
727 | bufferlist *oid_bl = &((*resultbl)[i++]); | |
728 | for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin(); | |
729 | q != p->buffer_extents.end(); | |
730 | ++q) { | |
731 | bufferlist buffer_bl; | |
732 | buffer_bl.substr_of(*bl, q->first, q->second); | |
733 | oid_bl->append(buffer_bl); | |
734 | } | |
735 | // read all extends of a given object in one go | |
736 | nc->add_request(); | |
737 | // we need 2 references on data as both rados_req_read_safe and rados_req_read_complete | |
738 | // will release one | |
739 | auto data = ceph::make_ref<RadosReadCompletionData>(nc, p->length, oid_bl, cct()); | |
740 | librados::AioCompletion *rados_completion = | |
741 | librados::Rados::aio_create_completion(data.detach(), rados_req_read_complete); | |
742 | r = m_ioCtx.aio_read(p->oid.name, rados_completion, oid_bl, p->length, p->offset); | |
743 | rados_completion->release(); | |
744 | if (r < 0) | |
745 | break; | |
746 | } | |
747 | nc->finish_adding_requests(); | |
748 | return r; | |
749 | } | |
750 | ||
751 | int libradosstriper::RadosStriperImpl::aio_read(const std::string& soid, | |
752 | librados::AioCompletionImpl *c, | |
753 | char* buf, | |
754 | size_t len, | |
755 | uint64_t off) | |
756 | { | |
757 | // create a buffer list and store it inside the completion object | |
758 | c->bl.clear(); | |
759 | c->bl.push_back(buffer::create_static(len, buf)); | |
760 | // call the bufferlist version of this method | |
761 | return aio_read(soid, c, &c->bl, len, off); | |
762 | } | |
763 | ||
764 | int libradosstriper::RadosStriperImpl::aio_flush() | |
765 | { | |
766 | int ret; | |
767 | // pass to the rados level | |
768 | ret = m_ioCtx.aio_flush(); | |
769 | if (ret < 0) | |
770 | return ret; | |
771 | //wait all CompletionData are released | |
772 | std::unique_lock l{lock}; | |
773 | cond.wait(l, [this] {return m_refCnt <= 1;}); | |
774 | return ret; | |
775 | } | |
776 | ||
777 | ///////////////////////// stat and deletion ///////////////////////////// | |
778 | ||
779 | int libradosstriper::RadosStriperImpl::stat(const std::string& soid, uint64_t *psize, time_t *pmtime) | |
780 | { | |
781 | // create a completion object | |
782 | librados::AioCompletionImpl c; | |
783 | // call asynchronous version of stat | |
784 | int rc = aio_stat(soid, &c, psize, pmtime); | |
785 | if (rc == 0) { | |
786 | // wait for completion of the remove | |
787 | c.wait_for_complete(); | |
788 | // get result | |
789 | rc = c.get_return_value(); | |
790 | } | |
791 | return rc; | |
792 | } | |
793 | ||
794 | static void striper_stat_aio_stat_complete(rados_completion_t c, void *arg) { | |
795 | auto data = ceph::ref_t<BasicStatCompletionData>(static_cast<BasicStatCompletionData*>(arg), false); | |
796 | int rc = rados_aio_get_return_value(c); | |
797 | if (rc == -ENOENT) { | |
798 | // remember this has failed | |
799 | data->m_statRC = rc; | |
800 | } | |
801 | data->m_multiCompletion->complete_request(rc); | |
802 | } | |
803 | ||
804 | static void striper_stat_aio_getxattr_complete(rados_completion_t c, void *arg) { | |
805 | auto data = ceph::ref_t<BasicStatCompletionData>(static_cast<BasicStatCompletionData*>(arg), false); | |
806 | int rc = rados_aio_get_return_value(c); | |
807 | // We need to handle the case of sparse files here | |
808 | if (rc < 0) { | |
809 | // remember this has failed | |
810 | data->m_getxattrRC = rc; | |
811 | } else { | |
812 | // this intermediate string allows to add a null terminator before calling strtol | |
813 | std::string err; | |
814 | std::string strsize(data->m_bl.c_str(), data->m_bl.length()); | |
815 | *data->m_psize = strict_strtoll(strsize.c_str(), 10, &err); | |
816 | if (!err.empty()) { | |
817 | lderr(data->m_striper->cct()) << XATTR_SIZE << " : " << err << dendl; | |
818 | data->m_getxattrRC = -EINVAL; | |
819 | } | |
820 | rc = 0; | |
821 | } | |
822 | data->m_multiCompletion->complete_request(rc); | |
823 | } | |
824 | ||
825 | static void striper_stat_aio_req_complete(rados_striper_multi_completion_t c, | |
826 | void *arg) { | |
827 | auto data = ceph::ref_t<BasicStatCompletionData>(static_cast<BasicStatCompletionData*>(arg), false); | |
828 | if (data->m_statRC) { | |
829 | data->complete(data->m_statRC); | |
830 | } else { | |
831 | if (data->m_getxattrRC < 0) { | |
832 | data->complete(data->m_getxattrRC); | |
833 | } else { | |
834 | data->complete(0); | |
835 | } | |
836 | } | |
837 | } | |
838 | ||
839 | template<class TimeType> | |
840 | int libradosstriper::RadosStriperImpl::aio_generic_stat | |
841 | (const std::string& soid, | |
842 | librados::AioCompletionImpl *c, | |
843 | uint64_t *psize, | |
844 | TimeType *pmtime, | |
845 | typename libradosstriper::RadosStriperImpl::StatFunction<TimeType>::Type statFunction) | |
846 | { | |
847 | // use a MultiAioCompletion object for dealing with the fact | |
848 | // that we'll do 2 asynchronous calls in parallel | |
849 | MultiAioCompletionImplPtr multi_completion{ | |
850 | new libradosstriper::MultiAioCompletionImpl, false}; | |
851 | // Data object used for passing context to asynchronous calls | |
852 | std::string firstObjOid = getObjectId(soid, 0); | |
853 | auto cdata = ceph::make_ref<StatCompletionData<TimeType>>(this, firstObjOid, c, multi_completion.get(), psize, pmtime); | |
854 | multi_completion->set_complete_callback(cdata->get() /* create ref! */, striper_stat_aio_req_complete); | |
855 | // use a regular AioCompletion for the stat async call | |
856 | librados::AioCompletion *stat_completion = | |
857 | librados::Rados::aio_create_completion(cdata->get() /* create ref! */, striper_stat_aio_stat_complete); | |
858 | multi_completion->add_safe_request(); | |
859 | object_t obj(firstObjOid); | |
860 | int rc = (m_ioCtxImpl->*statFunction)(obj, stat_completion->pc, | |
861 | &cdata->m_objectSize, cdata->m_pmtime); | |
862 | stat_completion->release(); | |
863 | if (rc < 0) { | |
864 | // nothing is really started so cancel everything | |
865 | delete cdata.detach(); | |
866 | return rc; | |
867 | } | |
868 | // use a regular AioCompletion for the getxattr async call | |
869 | librados::AioCompletion *getxattr_completion = | |
870 | librados::Rados::aio_create_completion(cdata->get() /* create ref! */, striper_stat_aio_getxattr_complete); | |
871 | multi_completion->add_safe_request(); | |
872 | // in parallel, get the pmsize from the first object asynchronously | |
873 | rc = m_ioCtxImpl->aio_getxattr(obj, getxattr_completion->pc, | |
874 | XATTR_SIZE, cdata->m_bl); | |
875 | getxattr_completion->release(); | |
876 | multi_completion->finish_adding_requests(); | |
877 | if (rc < 0) { | |
878 | // the async stat is ongoing, so we need to go on | |
879 | // we mark the getxattr as failed in the data object | |
880 | cdata->m_getxattrRC = rc; | |
881 | multi_completion->complete_request(rc); | |
882 | return rc; | |
883 | } | |
884 | return 0; | |
885 | } | |
886 | ||
887 | int libradosstriper::RadosStriperImpl::aio_stat(const std::string& soid, | |
888 | librados::AioCompletionImpl *c, | |
889 | uint64_t *psize, | |
890 | time_t *pmtime) | |
891 | { | |
892 | return aio_generic_stat<time_t>(soid, c, psize, pmtime, &librados::IoCtxImpl::aio_stat); | |
893 | } | |
894 | ||
895 | int libradosstriper::RadosStriperImpl::stat2(const std::string& soid, uint64_t *psize, struct timespec *pts) | |
896 | { | |
897 | // create a completion object | |
898 | librados::AioCompletionImpl c; | |
899 | // call asynchronous version of stat | |
900 | int rc = aio_stat2(soid, &c, psize, pts); | |
901 | if (rc == 0) { | |
902 | // wait for completion of the remove | |
903 | c.wait_for_complete_and_cb(); | |
904 | // get result | |
905 | rc = c.get_return_value(); | |
906 | } | |
907 | return rc; | |
908 | } | |
909 | ||
910 | int libradosstriper::RadosStriperImpl::aio_stat2(const std::string& soid, | |
911 | librados::AioCompletionImpl *c, | |
912 | uint64_t *psize, | |
913 | struct timespec *pts) | |
914 | { | |
915 | return aio_generic_stat<struct timespec>(soid, c, psize, pts, &librados::IoCtxImpl::aio_stat2); | |
916 | } | |
917 | ||
918 | static void rados_req_remove_complete(rados_completion_t c, void *arg) | |
919 | { | |
920 | auto cdata = static_cast<RadosRemoveCompletionData*>(arg); | |
921 | int rc = rados_aio_get_return_value(c); | |
922 | // in case the object did not exist, it means we had a sparse file, all is fine | |
923 | if (rc == -ENOENT) { | |
924 | rc = 0; | |
925 | } | |
926 | cdata->m_multiAioCompl->complete_request(rc); | |
927 | cdata->m_multiAioCompl->safe_request(rc); | |
928 | } | |
929 | ||
930 | static void striper_remove_aio_req_complete(rados_striper_multi_completion_t c, void *arg) | |
931 | { | |
932 | auto cdata = ceph::ref_t<RemoveCompletionData>(static_cast<RemoveCompletionData*>(arg), false); | |
933 | libradosstriper::MultiAioCompletionImpl *comp = | |
934 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
935 | ldout(cdata->m_striper->cct(), 10) | |
936 | << "RadosStriperImpl : striper_remove_aio_req_complete called for " | |
937 | << cdata->m_soid << dendl; | |
938 | int rc = comp->rval; | |
939 | if (rc == 0) { | |
940 | // All went fine, synchronously remove first object | |
941 | rc = cdata->m_striper->m_ioCtx.remove(cdata->m_striper->getObjectId(cdata->m_soid, 0), | |
942 | cdata->flags); | |
943 | } else { | |
944 | lderr(cdata->m_striper->cct()) | |
945 | << "RadosStriperImpl : deletion/truncation incomplete for " << cdata->m_soid | |
946 | << ", as errors were encountered. The file is left present but it's content " | |
947 | << " has been partially removed" | |
948 | << dendl; | |
949 | } | |
950 | cdata->complete(rc); | |
951 | } | |
952 | ||
953 | int libradosstriper::RadosStriperImpl::remove(const std::string& soid, int flags) | |
954 | { | |
955 | // create a completion object | |
956 | librados::AioCompletionImpl c; | |
957 | // call asynchronous version of remove | |
958 | int rc = aio_remove(soid, &c, flags); | |
959 | if (rc == 0) { | |
960 | // wait for completion of the remove | |
961 | c.wait_for_complete_and_cb(); | |
962 | // get result | |
963 | rc = c.get_return_value(); | |
964 | } | |
965 | return rc; | |
966 | } | |
967 | ||
968 | int libradosstriper::RadosStriperImpl::aio_remove(const std::string& soid, | |
969 | librados::AioCompletionImpl *c, | |
970 | int flags) | |
971 | { | |
972 | // the RemoveCompletionData object will lock the given soid for the duration | |
973 | // of the removal | |
974 | std::string lockCookie = getUUID(); | |
975 | int rc = m_ioCtx.lock_exclusive(getObjectId(soid, 0), RADOS_LOCK_NAME, lockCookie, "", 0, 0); | |
976 | if (rc) return rc; | |
977 | // create CompletionData for the async remove call | |
978 | auto cdata = ceph::make_ref<RemoveCompletionData>(this, soid, lockCookie, c, flags); | |
979 | MultiAioCompletionImplPtr multi_completion{ | |
980 | new libradosstriper::MultiAioCompletionImpl, false}; | |
981 | multi_completion->set_complete_callback(cdata->get() /* create ref! */, striper_remove_aio_req_complete); | |
982 | // call asynchronous internal version of remove | |
983 | ldout(cct(), 10) | |
984 | << "RadosStriperImpl : Aio_remove starting for " | |
985 | << soid << dendl; | |
986 | rc = internal_aio_remove(soid, multi_completion); | |
987 | return rc; | |
988 | } | |
989 | ||
990 | int libradosstriper::RadosStriperImpl::internal_aio_remove( | |
991 | const std::string& soid, | |
992 | MultiAioCompletionImplPtr multi_completion, | |
993 | int flags) | |
994 | { | |
995 | std::string firstObjOid = getObjectId(soid, 0); | |
996 | try { | |
997 | // check size and get number of rados objects to delete | |
998 | uint64_t nb_objects = 0; | |
999 | bufferlist bl2; | |
1000 | int rc = getxattr(soid, XATTR_SIZE, bl2); | |
1001 | if (rc < 0) { | |
1002 | // no object size (or not able to get it) | |
1003 | // try to find the number of object "by hand" | |
1004 | uint64_t psize; | |
1005 | time_t pmtime; | |
1006 | while (!m_ioCtx.stat(getObjectId(soid, nb_objects), &psize, &pmtime)) { | |
1007 | nb_objects++; | |
1008 | } | |
1009 | } else { | |
1010 | // count total number of rados objects in the striped object | |
1011 | std::string err; | |
1012 | // this intermediate string allows to add a null terminator before calling strtol | |
1013 | std::string strsize(bl2.c_str(), bl2.length()); | |
1014 | uint64_t size = strict_strtoll(strsize.c_str(), 10, &err); | |
1015 | if (!err.empty()) { | |
1016 | lderr(cct()) << XATTR_SIZE << " : " << err << dendl; | |
1017 | ||
1018 | return -EINVAL; | |
1019 | } | |
1020 | uint64_t object_size = m_layout.fl_object_size; | |
1021 | uint64_t su = m_layout.fl_stripe_unit; | |
1022 | uint64_t stripe_count = m_layout.fl_stripe_count; | |
1023 | uint64_t nb_complete_sets = size / (object_size*stripe_count); | |
1024 | uint64_t remaining_data = size % (object_size*stripe_count); | |
1025 | uint64_t remaining_stripe_units = (remaining_data + su -1) / su; | |
1026 | uint64_t remaining_objects = std::min(remaining_stripe_units, stripe_count); | |
1027 | nb_objects = nb_complete_sets * stripe_count + remaining_objects; | |
1028 | } | |
1029 | // delete rados objects in reverse order | |
1030 | // Note that we do not drop the first object. This one will only be dropped | |
1031 | // if all other removals have been successful, and this is done in the | |
1032 | // callback of the multi_completion object | |
1033 | int rcr = 0; | |
1034 | for (int i = nb_objects-1; i >= 1; i--) { | |
1035 | multi_completion->add_request(); | |
1036 | auto data = ceph::make_ref<RadosRemoveCompletionData>(multi_completion, cct()); | |
1037 | librados::AioCompletion *rados_completion = | |
1038 | librados::Rados::aio_create_completion(data->get() /* create ref! */, | |
1039 | rados_req_remove_complete); | |
1040 | if (flags == 0) { | |
1041 | rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion); | |
1042 | } else { | |
1043 | rcr = m_ioCtx.aio_remove(getObjectId(soid, i), rados_completion, flags); | |
1044 | } | |
1045 | rados_completion->release(); | |
1046 | if (rcr < 0 and -ENOENT != rcr) { | |
1047 | lderr(cct()) << "RadosStriperImpl::remove : deletion incomplete for " << soid | |
1048 | << ", as " << getObjectId(soid, i) << " could not be deleted (rc=" << rc << ")" | |
1049 | << dendl; | |
1050 | break; | |
1051 | } | |
1052 | } | |
1053 | // we are over adding requests to the multi_completion object | |
1054 | multi_completion->finish_adding_requests(); | |
1055 | // return | |
1056 | return rcr; | |
1057 | } catch (ErrorCode &e) { | |
1058 | // error caught when trying to take the exclusive lock | |
1059 | return e.m_code; | |
1060 | } | |
1061 | ||
1062 | } | |
1063 | ||
1064 | int libradosstriper::RadosStriperImpl::trunc(const std::string& soid, uint64_t size) | |
1065 | { | |
1066 | // lock the object in exclusive mode | |
1067 | std::string firstObjOid = getObjectId(soid, 0); | |
1068 | librados::ObjectWriteOperation op; | |
1069 | op.assert_exists(); | |
1070 | std::string lockCookie = RadosStriperImpl::getUUID(); | |
1071 | utime_t dur = utime_t(); | |
1072 | rados::cls::lock::lock(&op, RADOS_LOCK_NAME, ClsLockType::EXCLUSIVE, lockCookie, "", "", dur, 0); | |
1073 | int rc = m_ioCtx.operate(firstObjOid, &op); | |
1074 | if (rc) return rc; | |
1075 | // load layout and size | |
1076 | ceph_file_layout layout; | |
1077 | uint64_t original_size; | |
1078 | rc = internal_get_layout_and_size(firstObjOid, &layout, &original_size); | |
1079 | if (!rc) { | |
1080 | if (size < original_size) { | |
1081 | rc = truncate(soid, original_size, size, layout); | |
1082 | } else if (size > original_size) { | |
1083 | rc = grow(soid, original_size, size, layout); | |
1084 | } | |
1085 | } | |
1086 | // unlock object, ignore return code as we cannot do much | |
1087 | m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie); | |
1088 | // final return | |
1089 | return rc; | |
1090 | } | |
1091 | ||
1092 | ||
1093 | ///////////////////////// private helpers ///////////////////////////// | |
1094 | ||
1095 | std::string libradosstriper::RadosStriperImpl::getObjectId(const object_t& soid, | |
1096 | long long unsigned objectno) | |
1097 | { | |
1098 | std::ostringstream s; | |
1099 | s << soid << '.' << std::setfill ('0') << std::setw(16) << std::hex << objectno; | |
1100 | return s.str(); | |
1101 | } | |
1102 | ||
1103 | void libradosstriper::RadosStriperImpl::unlockObject(const std::string& soid, | |
1104 | const std::string& lockCookie) | |
1105 | { | |
1106 | // unlock the shared lock on the first rados object | |
1107 | std::string firstObjOid = getObjectId(soid, 0); | |
1108 | m_ioCtx.unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie); | |
1109 | } | |
1110 | ||
1111 | void libradosstriper::RadosStriperImpl::aio_unlockObject(const std::string& soid, | |
1112 | const std::string& lockCookie, | |
1113 | librados::AioCompletion *c) | |
1114 | { | |
1115 | // unlock the shared lock on the first rados object | |
1116 | std::string firstObjOid = getObjectId(soid, 0); | |
1117 | m_ioCtx.aio_unlock(firstObjOid, RADOS_LOCK_NAME, lockCookie, c); | |
1118 | } | |
1119 | ||
1120 | static void rados_write_aio_unlock_complete(rados_striper_multi_completion_t c, void *arg) | |
1121 | { | |
1122 | auto cdata = ceph::ref_t<WriteCompletionData>(static_cast<WriteCompletionData*>(arg), false); | |
1123 | libradosstriper::MultiAioCompletionImpl *comp = | |
1124 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
1125 | cdata->complete_unlock(comp->rval); | |
1126 | } | |
1127 | ||
1128 | static void striper_write_aio_req_complete(rados_striper_multi_completion_t c, void *arg) | |
1129 | { | |
1130 | auto cdata = ceph::ref_t<WriteCompletionData>(static_cast<WriteCompletionData*>(arg), false); | |
1131 | // launch the async unlocking of the object | |
1132 | cdata->m_striper->aio_unlockObject(cdata->m_soid, cdata->m_lockCookie, cdata->m_unlockCompletion); | |
1133 | // complete the write part in parallel | |
1134 | libradosstriper::MultiAioCompletionImpl *comp = | |
1135 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
1136 | cdata->complete_write(comp->rval); | |
1137 | } | |
1138 | ||
1139 | static void striper_write_aio_req_safe(rados_striper_multi_completion_t c, void *arg) | |
1140 | { | |
1141 | auto cdata = ceph::ref_t<WriteCompletionData>(static_cast<WriteCompletionData*>(arg), false); | |
1142 | libradosstriper::MultiAioCompletionImpl *comp = | |
1143 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
1144 | cdata->safe(comp->rval); | |
1145 | } | |
1146 | ||
1147 | int libradosstriper::RadosStriperImpl::write_in_open_object(const std::string& soid, | |
1148 | const ceph_file_layout& layout, | |
1149 | const std::string& lockCookie, | |
1150 | const bufferlist& bl, | |
1151 | size_t len, | |
1152 | uint64_t off) { | |
1153 | // create a completion object to be passed to the callbacks of the multicompletion | |
1154 | // we need 3 references as striper_write_aio_req_complete will release two and | |
1155 | // striper_write_aio_req_safe will release one | |
1156 | auto cdata = ceph::make_ref<WriteCompletionData>(this, soid, lockCookie, nullptr); | |
1157 | // create a completion object for the unlocking of the striped object at the end of the write | |
1158 | librados::AioCompletion *unlock_completion = | |
1159 | librados::Rados::aio_create_completion(cdata->get() /* create ref! */, rados_write_aio_unlock_complete); | |
1160 | cdata->m_unlockCompletion = unlock_completion; | |
1161 | // create the multicompletion that will handle the write completion | |
1162 | MultiAioCompletionImplPtr c{new libradosstriper::MultiAioCompletionImpl, | |
1163 | false}; | |
1164 | c->set_complete_callback(cdata->get() /* create ref! */, striper_write_aio_req_complete); | |
1165 | c->set_safe_callback(cdata->get() /* create ref! */, striper_write_aio_req_safe); | |
1166 | // call the asynchronous API | |
1167 | int rc = internal_aio_write(soid, c, bl, len, off, layout); | |
1168 | if (!rc) { | |
1169 | // wait for completion and safety of data | |
1170 | c->wait_for_complete_and_cb(); | |
1171 | c->wait_for_safe_and_cb(); | |
1172 | // wait for the unlocking | |
1173 | unlock_completion->wait_for_complete(); | |
1174 | // return result | |
1175 | rc = c->get_return_value(); | |
1176 | } | |
1177 | return rc; | |
1178 | } | |
1179 | ||
1180 | int libradosstriper::RadosStriperImpl::aio_write_in_open_object(const std::string& soid, | |
1181 | librados::AioCompletionImpl *c, | |
1182 | const ceph_file_layout& layout, | |
1183 | const std::string& lockCookie, | |
1184 | const bufferlist& bl, | |
1185 | size_t len, | |
1186 | uint64_t off) { | |
1187 | // create a completion object to be passed to the callbacks of the multicompletion | |
1188 | // we need 3 references as striper_write_aio_req_complete will release two and | |
1189 | // striper_write_aio_req_safe will release one | |
1190 | auto cdata = ceph::make_ref<WriteCompletionData>(this, soid, lockCookie, c); | |
1191 | m_ioCtxImpl->get(); | |
1192 | c->io = m_ioCtxImpl; | |
1193 | // create a completion object for the unlocking of the striped object at the end of the write | |
1194 | librados::AioCompletion *unlock_completion = | |
1195 | librados::Rados::aio_create_completion(cdata->get() /* create ref! */, rados_write_aio_unlock_complete); | |
1196 | cdata->m_unlockCompletion = unlock_completion; | |
1197 | // create the multicompletion that will handle the write completion | |
1198 | libradosstriper::MultiAioCompletionImplPtr nc{ | |
1199 | new libradosstriper::MultiAioCompletionImpl, false}; | |
1200 | nc->set_complete_callback(cdata->get() /* create ref! */, striper_write_aio_req_complete); | |
1201 | nc->set_safe_callback(cdata->get() /* create ref! */, striper_write_aio_req_safe); | |
1202 | // internal asynchronous API | |
1203 | int rc = internal_aio_write(soid, nc, bl, len, off, layout); | |
1204 | return rc; | |
1205 | } | |
1206 | ||
1207 | static void rados_req_write_complete(rados_completion_t c, void *arg) | |
1208 | { | |
1209 | auto comp = reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(arg); | |
1210 | comp->complete_request(rados_aio_get_return_value(c)); | |
1211 | comp->safe_request(rados_aio_get_return_value(c)); | |
1212 | } | |
1213 | ||
1214 | int | |
1215 | libradosstriper::RadosStriperImpl::internal_aio_write(const std::string& soid, | |
1216 | libradosstriper::MultiAioCompletionImplPtr c, | |
1217 | const bufferlist& bl, | |
1218 | size_t len, | |
1219 | uint64_t off, | |
1220 | const ceph_file_layout& layout) | |
1221 | { | |
1222 | int r = 0; | |
1223 | // Do not try anything if we are called with empty buffer, | |
1224 | // file_to_extents would raise an exception | |
1225 | if (len > 0) { | |
1226 | // get list of extents to be written to | |
1227 | vector<ObjectExtent> extents; | |
1228 | std::string format = soid; | |
1229 | boost::replace_all(format, "%", "%%"); | |
1230 | format += RADOS_OBJECT_EXTENSION_FORMAT; | |
1231 | file_layout_t l; | |
1232 | l.from_legacy(layout); | |
1233 | Striper::file_to_extents(cct(), format.c_str(), &l, off, len, 0, extents); | |
1234 | // go through the extents | |
1235 | for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) { | |
1236 | // assemble pieces of a given object into a single buffer list | |
1237 | bufferlist oid_bl; | |
1238 | for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin(); | |
1239 | q != p->buffer_extents.end(); | |
1240 | ++q) { | |
1241 | bufferlist buffer_bl; | |
1242 | buffer_bl.substr_of(bl, q->first, q->second); | |
1243 | oid_bl.append(buffer_bl); | |
1244 | } | |
1245 | // and write the object | |
1246 | c->add_request(); | |
1247 | librados::AioCompletion *rados_completion = | |
1248 | librados::Rados::aio_create_completion(c.get(), | |
1249 | rados_req_write_complete); | |
1250 | r = m_ioCtx.aio_write(p->oid.name, rados_completion, oid_bl, | |
1251 | p->length, p->offset); | |
1252 | rados_completion->release(); | |
1253 | if (r < 0) | |
1254 | break; | |
1255 | } | |
1256 | } | |
1257 | c->finish_adding_requests(); | |
1258 | return r; | |
1259 | } | |
1260 | ||
1261 | int libradosstriper::RadosStriperImpl::extract_uint32_attr | |
1262 | (std::map<std::string, bufferlist> &attrs, | |
1263 | const std::string& key, | |
1264 | ceph_le32 *value) | |
1265 | { | |
1266 | std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key); | |
1267 | if (attrsIt != attrs.end()) { | |
1268 | // this intermediate string allows to add a null terminator before calling strtol | |
1269 | std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length()); | |
1270 | std::string err; | |
1271 | *value = strict_strtol(strvalue.c_str(), 10, &err); | |
1272 | if (!err.empty()) { | |
1273 | lderr(cct()) << key << " : " << err << dendl; | |
1274 | return -EINVAL; | |
1275 | } | |
1276 | } else { | |
1277 | return -ENOENT; | |
1278 | } | |
1279 | return 0; | |
1280 | } | |
1281 | ||
1282 | int libradosstriper::RadosStriperImpl::extract_sizet_attr | |
1283 | (std::map<std::string, bufferlist> &attrs, | |
1284 | const std::string& key, | |
1285 | size_t *value) | |
1286 | { | |
1287 | std::map<std::string, bufferlist>::iterator attrsIt = attrs.find(key); | |
1288 | if (attrsIt != attrs.end()) { | |
1289 | // this intermediate string allows to add a null terminator before calling strtol | |
1290 | std::string strvalue(attrsIt->second.c_str(), attrsIt->second.length()); | |
1291 | std::string err; | |
1292 | *value = strict_strtoll(strvalue.c_str(), 10, &err); | |
1293 | if (!err.empty()) { | |
1294 | lderr(cct()) << key << " : " << err << dendl; | |
1295 | return -EINVAL; | |
1296 | } | |
1297 | } else { | |
1298 | return -ENOENT; | |
1299 | } | |
1300 | return 0; | |
1301 | } | |
1302 | ||
1303 | int libradosstriper::RadosStriperImpl::internal_get_layout_and_size( | |
1304 | const std::string& oid, | |
1305 | ceph_file_layout *layout, | |
1306 | uint64_t *size) | |
1307 | { | |
1308 | // get external attributes of the first rados object | |
1309 | std::map<std::string, bufferlist> attrs; | |
1310 | int rc = m_ioCtx.getxattrs(oid, attrs); | |
1311 | if (rc) return rc; | |
1312 | // deal with stripe_unit | |
1313 | rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_UNIT, &layout->fl_stripe_unit); | |
1314 | if (rc) return rc; | |
1315 | // deal with stripe_count | |
1316 | rc = extract_uint32_attr(attrs, XATTR_LAYOUT_STRIPE_COUNT, &layout->fl_stripe_count); | |
1317 | if (rc) return rc; | |
1318 | // deal with object_size | |
1319 | rc = extract_uint32_attr(attrs, XATTR_LAYOUT_OBJECT_SIZE, &layout->fl_object_size); | |
1320 | if (rc) return rc; | |
1321 | // deal with size | |
1322 | size_t ssize; | |
1323 | rc = extract_sizet_attr(attrs, XATTR_SIZE, &ssize); | |
1324 | if (rc) { | |
1325 | return rc; | |
1326 | } | |
1327 | *size = ssize; | |
1328 | // make valgrind happy by setting unused fl_pg_pool | |
1329 | layout->fl_pg_pool = 0; | |
1330 | return 0; | |
1331 | } | |
1332 | ||
1333 | int libradosstriper::RadosStriperImpl::openStripedObjectForRead( | |
1334 | const std::string& soid, | |
1335 | ceph_file_layout *layout, | |
1336 | uint64_t *size, | |
1337 | std::string *lockCookie) | |
1338 | { | |
1339 | // take a lock the first rados object, if it exists and gets its size | |
1340 | // check, lock and size reading must be atomic and are thus done within a single operation | |
1341 | librados::ObjectWriteOperation op; | |
1342 | op.assert_exists(); | |
1343 | *lockCookie = getUUID(); | |
1344 | utime_t dur = utime_t(); | |
1345 | rados::cls::lock::lock(&op, RADOS_LOCK_NAME, ClsLockType::SHARED, *lockCookie, "Tag", "", dur, 0); | |
1346 | std::string firstObjOid = getObjectId(soid, 0); | |
1347 | int rc = m_ioCtx.operate(firstObjOid, &op); | |
1348 | if (rc) { | |
1349 | // error case (including -ENOENT) | |
1350 | return rc; | |
1351 | } | |
1352 | rc = internal_get_layout_and_size(firstObjOid, layout, size); | |
1353 | if (rc) { | |
1354 | unlockObject(soid, *lockCookie); | |
1355 | lderr(cct()) << "RadosStriperImpl::openStripedObjectForRead : " | |
1356 | << "could not load layout and size for " | |
1357 | << soid << " : rc = " << rc << dendl; | |
1358 | } | |
1359 | return rc; | |
1360 | } | |
1361 | ||
1362 | int libradosstriper::RadosStriperImpl::openStripedObjectForWrite(const std::string& soid, | |
1363 | ceph_file_layout *layout, | |
1364 | uint64_t *size, | |
1365 | std::string *lockCookie, | |
1366 | bool isFileSizeAbsolute) | |
1367 | { | |
1368 | // take a lock the first rados object, if it exists | |
1369 | // check and lock must be atomic and are thus done within a single operation | |
1370 | librados::ObjectWriteOperation op; | |
1371 | op.assert_exists(); | |
1372 | *lockCookie = getUUID(); | |
1373 | utime_t dur = utime_t(); | |
1374 | rados::cls::lock::lock(&op, RADOS_LOCK_NAME, ClsLockType::SHARED, *lockCookie, "Tag", "", dur, 0); | |
1375 | std::string firstObjOid = getObjectId(soid, 0); | |
1376 | int rc = m_ioCtx.operate(firstObjOid, &op); | |
1377 | if (rc) { | |
1378 | if (rc == -ENOENT) { | |
1379 | // object does not exist, delegate to createEmptyStripedObject | |
1380 | int rc = createAndOpenStripedObject(soid, layout, *size, lockCookie, isFileSizeAbsolute); | |
1381 | // return original size | |
1382 | *size = 0; | |
1383 | return rc; | |
1384 | } else { | |
1385 | return rc; | |
1386 | } | |
1387 | } | |
1388 | // all fine | |
1389 | uint64_t curSize; | |
1390 | rc = internal_get_layout_and_size(firstObjOid, layout, &curSize); | |
1391 | if (rc) { | |
1392 | unlockObject(soid, *lockCookie); | |
1393 | lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : " | |
1394 | << "could not load layout and size for " | |
1395 | << soid << " : rc = " << rc << dendl; | |
1396 | return rc; | |
1397 | } | |
1398 | // atomically update object size, only if smaller than current one | |
1399 | if (!isFileSizeAbsolute) | |
1400 | *size += curSize; | |
1401 | librados::ObjectWriteOperation writeOp; | |
1402 | writeOp.cmpxattr(XATTR_SIZE, LIBRADOS_CMPXATTR_OP_GT, *size); | |
1403 | std::ostringstream oss; | |
1404 | oss << *size; | |
1405 | bufferlist bl; | |
1406 | bl.append(oss.str()); | |
1407 | writeOp.setxattr(XATTR_SIZE, bl); | |
1408 | rc = m_ioCtx.operate(firstObjOid, &writeOp); | |
1409 | // return current size | |
1410 | *size = curSize; | |
1411 | // handle case where objectsize is already bigger than size | |
1412 | if (-ECANCELED == rc) | |
1413 | rc = 0; | |
1414 | if (rc) { | |
1415 | unlockObject(soid, *lockCookie); | |
1416 | lderr(cct()) << "RadosStriperImpl::openStripedObjectForWrite : " | |
1417 | << "could not set new size for " | |
1418 | << soid << " : rc = " << rc << dendl; | |
1419 | } | |
1420 | return rc; | |
1421 | } | |
1422 | ||
1423 | int libradosstriper::RadosStriperImpl::createAndOpenStripedObject(const std::string& soid, | |
1424 | ceph_file_layout *layout, | |
1425 | uint64_t size, | |
1426 | std::string *lockCookie, | |
1427 | bool isFileSizeAbsolute) | |
1428 | { | |
1429 | // build atomic write operation | |
1430 | librados::ObjectWriteOperation writeOp; | |
1431 | writeOp.create(true); | |
1432 | // object_size | |
1433 | std::ostringstream oss_object_size; | |
1434 | oss_object_size << m_layout.fl_object_size; | |
1435 | bufferlist bl_object_size; | |
1436 | bl_object_size.append(oss_object_size.str()); | |
1437 | writeOp.setxattr(XATTR_LAYOUT_OBJECT_SIZE, bl_object_size); | |
1438 | // stripe unit | |
1439 | std::ostringstream oss_stripe_unit; | |
1440 | oss_stripe_unit << m_layout.fl_stripe_unit; | |
1441 | bufferlist bl_stripe_unit; | |
1442 | bl_stripe_unit.append(oss_stripe_unit.str()); | |
1443 | writeOp.setxattr(XATTR_LAYOUT_STRIPE_UNIT, bl_stripe_unit); | |
1444 | // stripe count | |
1445 | std::ostringstream oss_stripe_count; | |
1446 | oss_stripe_count << m_layout.fl_stripe_count; | |
1447 | bufferlist bl_stripe_count; | |
1448 | bl_stripe_count.append(oss_stripe_count.str()); | |
1449 | writeOp.setxattr(XATTR_LAYOUT_STRIPE_COUNT, bl_stripe_count); | |
1450 | // size | |
1451 | std::ostringstream oss_size; | |
1452 | oss_size << (isFileSizeAbsolute?size:0); | |
1453 | bufferlist bl_size; | |
1454 | bl_size.append(oss_size.str()); | |
1455 | writeOp.setxattr(XATTR_SIZE, bl_size); | |
1456 | // effectively change attributes | |
1457 | std::string firstObjOid = getObjectId(soid, 0); | |
1458 | int rc = m_ioCtx.operate(firstObjOid, &writeOp); | |
1459 | // in case of error (but no EEXIST which would mean the object existed), return | |
1460 | if (rc && -EEXIST != rc) return rc; | |
1461 | // Otherwise open the object | |
1462 | uint64_t fileSize = size; | |
1463 | return openStripedObjectForWrite(soid, layout, &fileSize, lockCookie, isFileSizeAbsolute); | |
1464 | } | |
1465 | ||
1466 | static void striper_truncate_aio_req_complete(rados_striper_multi_completion_t c, void *arg) | |
1467 | { | |
1468 | auto cdata = ceph::ref_t<TruncateCompletionData>(static_cast<TruncateCompletionData*>(arg), false); | |
1469 | libradosstriper::MultiAioCompletionImpl *comp = | |
1470 | reinterpret_cast<libradosstriper::MultiAioCompletionImpl*>(c); | |
1471 | if (0 == comp->rval) { | |
1472 | // all went fine, change size in the external attributes | |
1473 | std::ostringstream oss; | |
1474 | oss << cdata->m_size; | |
1475 | bufferlist bl; | |
1476 | bl.append(oss.str()); | |
1477 | cdata->m_striper->setxattr(cdata->m_soid, XATTR_SIZE, bl); | |
1478 | } | |
1479 | } | |
1480 | ||
1481 | int libradosstriper::RadosStriperImpl::truncate(const std::string& soid, | |
1482 | uint64_t original_size, | |
1483 | uint64_t size, | |
1484 | ceph_file_layout &layout) | |
1485 | { | |
1486 | auto cdata = ceph::make_ref<TruncateCompletionData>(this, soid, size); | |
1487 | libradosstriper::MultiAioCompletionImplPtr multi_completion{ | |
1488 | new libradosstriper::MultiAioCompletionImpl, false}; | |
1489 | multi_completion->set_complete_callback(cdata->get() /* create ref! */, striper_truncate_aio_req_complete); | |
1490 | // call asynchrous version of truncate | |
1491 | int rc = aio_truncate(soid, multi_completion, original_size, size, layout); | |
1492 | // wait for completion of the truncation | |
1493 | multi_completion->finish_adding_requests(); | |
1494 | multi_completion->wait_for_complete_and_cb(); | |
1495 | // return result | |
1496 | if (rc == 0) { | |
1497 | rc = multi_completion->get_return_value(); | |
1498 | } | |
1499 | return rc; | |
1500 | } | |
1501 | ||
1502 | int libradosstriper::RadosStriperImpl::aio_truncate | |
1503 | (const std::string& soid, | |
1504 | libradosstriper::MultiAioCompletionImplPtr multi_completion, | |
1505 | uint64_t original_size, | |
1506 | uint64_t size, | |
1507 | ceph_file_layout &layout) | |
1508 | { | |
1509 | // handle the underlying rados objects. 3 cases here : | |
1510 | // -- the objects belonging to object sets entirely located | |
1511 | // before the truncation are unchanged | |
1512 | // -- the objects belonging to the object set where the | |
1513 | // truncation took place are truncated or removed | |
1514 | // -- the objects belonging to object sets entirely located | |
1515 | // after the truncation are removed | |
1516 | // Note that we do it backward and that we change the size in | |
1517 | // the external attributes only at the end. This make sure that | |
1518 | // no rados object stays behind if we remove the striped object | |
1519 | // after a truncation has failed | |
1520 | uint64_t trunc_objectsetno = size / layout.fl_object_size / layout.fl_stripe_count; | |
1521 | uint64_t last_objectsetno = original_size / layout.fl_object_size / layout.fl_stripe_count; | |
1522 | bool exists = false; | |
1523 | for (int64_t objectno = (last_objectsetno+1) * layout.fl_stripe_count-1; | |
1524 | objectno >= (int64_t)((trunc_objectsetno + 1) * layout.fl_stripe_count); | |
1525 | objectno--) { | |
1526 | // if no object existed so far, check object existence | |
1527 | if (!exists) { | |
1528 | uint64_t nb_full_object_set = objectno / layout.fl_stripe_count; | |
1529 | uint64_t object_index_in_set = objectno % layout.fl_stripe_count; | |
1530 | uint64_t set_start_off = nb_full_object_set * layout.fl_object_size * layout.fl_stripe_count; | |
1531 | uint64_t object_start_off = set_start_off + object_index_in_set * layout.fl_stripe_unit; | |
1532 | exists = (original_size > object_start_off); | |
1533 | } | |
1534 | if (exists) { | |
1535 | // remove asynchronously | |
1536 | multi_completion->add_request(); | |
1537 | auto data = ceph::make_ref<RadosRemoveCompletionData>(multi_completion, cct()); | |
1538 | librados::AioCompletion *rados_completion = | |
1539 | librados::Rados::aio_create_completion(data->get() /* create ref! */, | |
1540 | rados_req_remove_complete); | |
1541 | int rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion); | |
1542 | rados_completion->release(); | |
1543 | // in case the object did not exist, it means we had a sparse file, all is fine | |
1544 | if (rc && rc != -ENOENT) return rc; | |
1545 | } | |
1546 | } | |
1547 | for (int64_t objectno = ((trunc_objectsetno + 1) * layout.fl_stripe_count) -1; | |
1548 | objectno >= (int64_t)(trunc_objectsetno * layout.fl_stripe_count); | |
1549 | objectno--) { | |
1550 | // if no object existed so far, check object existence | |
1551 | if (!exists) { | |
1552 | uint64_t object_start_off = ((objectno / layout.fl_stripe_count) * layout.fl_object_size) + | |
1553 | ((objectno % layout.fl_stripe_count) * layout.fl_stripe_unit); | |
1554 | exists = (original_size > object_start_off); | |
1555 | } | |
1556 | if (exists) { | |
1557 | // truncate | |
1558 | file_layout_t l; | |
1559 | l.from_legacy(layout); | |
1560 | uint64_t new_object_size = Striper::object_truncate_size(cct(), &l, objectno, size); | |
1561 | int rc; | |
1562 | if (new_object_size > 0 or 0 == objectno) { | |
1563 | // trunc is synchronous as there is no async version | |
1564 | // but note that only a single object will be truncated | |
1565 | // reducing the overload to a fixed amount | |
1566 | rc = m_ioCtx.trunc(getObjectId(soid, objectno), new_object_size); | |
1567 | } else { | |
1568 | // removes are asynchronous in order to speed up truncations of big files | |
1569 | multi_completion->add_request(); | |
1570 | auto data = ceph::make_ref<RadosRemoveCompletionData>(multi_completion, cct()); | |
1571 | librados::AioCompletion *rados_completion = | |
1572 | librados::Rados::aio_create_completion(data->get() /* create ref! */, | |
1573 | rados_req_remove_complete); | |
1574 | rc = m_ioCtx.aio_remove(getObjectId(soid, objectno), rados_completion); | |
1575 | rados_completion->release(); | |
1576 | } | |
1577 | // in case the object did not exist, it means we had a sparse file, all is fine | |
1578 | if (rc && rc != -ENOENT) return rc; | |
1579 | } | |
1580 | } | |
1581 | return 0; | |
1582 | } | |
1583 | ||
1584 | int libradosstriper::RadosStriperImpl::grow(const std::string& soid, | |
1585 | uint64_t original_size, | |
1586 | uint64_t size, | |
1587 | ceph_file_layout &layout) | |
1588 | { | |
1589 | // handle the underlying rados objects. As we support sparse objects, | |
1590 | // we only have to change the size in the external attributes | |
1591 | std::ostringstream oss; | |
1592 | oss << size; | |
1593 | bufferlist bl; | |
1594 | bl.append(oss.str()); | |
1595 | int rc = m_ioCtx.setxattr(getObjectId(soid, 0), XATTR_SIZE, bl); | |
1596 | return rc; | |
1597 | } | |
1598 | ||
1599 | std::string libradosstriper::RadosStriperImpl::getUUID() | |
1600 | { | |
1601 | struct uuid_d uuid; | |
1602 | uuid.generate_random(); | |
1603 | char suuid[37]; | |
1604 | uuid.print(suuid); | |
1605 | return std::string(suuid); | |
1606 | } |