]> git.proxmox.com Git - ceph.git/blob - ceph/src/erasure-code/clay/ErasureCodeClay.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / erasure-code / clay / ErasureCodeClay.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2018 Indian Institute of Science <office.ece@iisc.ac.in>
7 *
8 * Author: Myna Vajha <mynaramana@gmail.com>
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 */
16
17 #include <errno.h>
18 #include <algorithm>
19
20 #include "ErasureCodeClay.h"
21
22 #include "common/debug.h"
23 #include "erasure-code/ErasureCodePlugin.h"
24 #include "include/ceph_assert.h"
25 #include "include/str_map.h"
26 #include "include/stringify.h"
27 #include "osd/osd_types.h"
28
29
30 #define dout_context g_ceph_context
31 #define dout_subsys ceph_subsys_osd
32 #undef dout_prefix
33 #define dout_prefix _prefix(_dout)
34
35 #define LARGEST_VECTOR_WORDSIZE 16
36 #define talloc(type, num) (type *) malloc(sizeof(type)*(num))
37
38 using namespace std;
39 using namespace ceph;
40 static ostream& _prefix(std::ostream* _dout)
41 {
42 return *_dout << "ErasureCodeClay: ";
43 }
44
45 static int pow_int(int a, int x) {
46 int power = 1;
47 while (x) {
48 if (x & 1) power *= a;
49 x /= 2;
50 a *= a;
51 }
52 return power;
53 }
54
55 ErasureCodeClay::~ErasureCodeClay()
56 {
57 for (int i = 0; i < q*t; i++) {
58 if (U_buf[i].length() != 0) U_buf[i].clear();
59 }
60 }
61
62 int ErasureCodeClay::init(ErasureCodeProfile &profile,
63 ostream *ss)
64 {
65 int r;
66 r = parse(profile, ss);
67 if (r)
68 return r;
69
70 r = ErasureCode::init(profile, ss);
71 if (r)
72 return r;
73 ErasureCodePluginRegistry &registry = ErasureCodePluginRegistry::instance();
74 r = registry.factory(mds.profile["plugin"],
75 directory,
76 mds.profile,
77 &mds.erasure_code,
78 ss);
79 if (r)
80 return r;
81 r = registry.factory(pft.profile["plugin"],
82 directory,
83 pft.profile,
84 &pft.erasure_code,
85 ss);
86 return r;
87
88 }
89
90 unsigned int ErasureCodeClay::get_chunk_size(unsigned int object_size) const
91 {
92 unsigned int alignment_scalar_code = pft.erasure_code->get_chunk_size(1);
93 unsigned int alignment = sub_chunk_no * k * alignment_scalar_code;
94
95 return round_up_to(object_size, alignment) / k;
96 }
97
98 int ErasureCodeClay::minimum_to_decode(const set<int> &want_to_read,
99 const set<int> &available,
100 map<int, vector<pair<int, int>>> *minimum)
101 {
102 if (is_repair(want_to_read, available)) {
103 return minimum_to_repair(want_to_read, available, minimum);
104 } else {
105 return ErasureCode::minimum_to_decode(want_to_read, available, minimum);
106 }
107 }
108
109 int ErasureCodeClay::decode(const set<int> &want_to_read,
110 const map<int, bufferlist> &chunks,
111 map<int, bufferlist> *decoded, int chunk_size)
112 {
113 set<int> avail;
114 for ([[maybe_unused]] auto& [node, bl] : chunks) {
115 avail.insert(node);
116 (void)bl; // silence -Wunused-variable
117 }
118
119 if (is_repair(want_to_read, avail) &&
120 ((unsigned int)chunk_size > chunks.begin()->second.length())) {
121 return repair(want_to_read, chunks, decoded, chunk_size);
122 } else {
123 return ErasureCode::_decode(want_to_read, chunks, decoded);
124 }
125 }
126
127 void p(const set<int> &s) { cerr << s; } // for gdb
128
129 int ErasureCodeClay::encode_chunks(const set<int> &want_to_encode,
130 map<int, bufferlist> *encoded)
131 {
132 map<int, bufferlist> chunks;
133 set<int> parity_chunks;
134 int chunk_size = (*encoded)[0].length();
135
136 for (int i = 0; i < k + m; i++) {
137 if (i < k) {
138 chunks[i] = (*encoded)[i];
139 } else {
140 chunks[i+nu] = (*encoded)[i];
141 parity_chunks.insert(i+nu);
142 }
143 }
144
145 for (int i = k; i < k + nu; i++) {
146 bufferptr buf(buffer::create_aligned(chunk_size, SIMD_ALIGN));
147 buf.zero();
148 chunks[i].push_back(std::move(buf));
149 }
150
151 int res = decode_layered(parity_chunks, &chunks);
152 for (int i = k ; i < k + nu; i++) {
153 // need to clean some of the intermediate chunks here!!
154 chunks[i].clear();
155 }
156 return res;
157 }
158
159 int ErasureCodeClay::decode_chunks(const set<int> &want_to_read,
160 const map<int, bufferlist> &chunks,
161 map<int, bufferlist> *decoded)
162 {
163 set<int> erasures;
164 map<int, bufferlist> coded_chunks;
165
166 for (int i = 0; i < k + m; i++) {
167 if (chunks.count(i) == 0) {
168 erasures.insert(i < k ? i : i+nu);
169 }
170 ceph_assert(decoded->count(i) > 0);
171 coded_chunks[i < k ? i : i+nu] = (*decoded)[i];
172 }
173 int chunk_size = coded_chunks[0].length();
174
175 for (int i = k; i < k+nu; i++) {
176 bufferptr buf(buffer::create_aligned(chunk_size, SIMD_ALIGN));
177 buf.zero();
178 coded_chunks[i].push_back(std::move(buf));
179 }
180
181 int res = decode_layered(erasures, &coded_chunks);
182 for (int i = k; i < k+nu; i++) {
183 coded_chunks[i].clear();
184 }
185 return res;
186 }
187
188 int ErasureCodeClay::parse(ErasureCodeProfile &profile,
189 ostream *ss)
190 {
191 int err = 0;
192 err = ErasureCode::parse(profile, ss);
193 err |= to_int("k", profile, &k, DEFAULT_K, ss);
194 err |= to_int("m", profile, &m, DEFAULT_M, ss);
195
196 err |= sanity_check_k_m(k, m, ss);
197
198 err |= to_int("d", profile, &d, std::to_string(k+m-1), ss);
199
200 // check for scalar_mds in profile input
201 if (profile.find("scalar_mds") == profile.end() ||
202 profile.find("scalar_mds")->second.empty()) {
203 mds.profile["plugin"] = "jerasure";
204 pft.profile["plugin"] = "jerasure";
205 } else {
206 std::string p = profile.find("scalar_mds")->second;
207 if ((p == "jerasure") || (p == "isa") || (p == "shec")) {
208 mds.profile["plugin"] = p;
209 pft.profile["plugin"] = p;
210 } else {
211 *ss << "scalar_mds " << mds.profile["plugin"] <<
212 "is not currently supported, use one of 'jerasure',"<<
213 " 'isa', 'shec'" << std::endl;
214 err = -EINVAL;
215 return err;
216 }
217 }
218
219 if (profile.find("technique") == profile.end() ||
220 profile.find("technique")->second.empty()) {
221 if ((mds.profile["plugin"]=="jerasure") || (mds.profile["plugin"]=="isa") ) {
222 mds.profile["technique"] = "reed_sol_van";
223 pft.profile["technique"] = "reed_sol_van";
224 } else {
225 mds.profile["technique"] = "single";
226 pft.profile["technique"] = "single";
227 }
228 } else {
229 std::string p = profile.find("technique")->second;
230 if (mds.profile["plugin"] == "jerasure") {
231 if ( (p == "reed_sol_van") || (p == "reed_sol_r6_op") || (p == "cauchy_orig")
232 || (p == "cauchy_good") || (p == "liber8tion")) {
233 mds.profile["technique"] = p;
234 pft.profile["technique"] = p;
235 } else {
236 *ss << "technique " << p << "is not currently supported, use one of "
237 << "reed_sol_van', 'reed_sol_r6_op','cauchy_orig',"
238 << "'cauchy_good','liber8tion'"<< std::endl;
239 err = -EINVAL;
240 return err;
241 }
242 } else if (mds.profile["plugin"] == "isa") {
243 if ( (p == "reed_sol_van") || (p == "cauchy")) {
244 mds.profile["technique"] = p;
245 pft.profile["technique"] = p;
246 } else {
247 *ss << "technique " << p << "is not currently supported, use one of"
248 << "'reed_sol_van','cauchy'"<< std::endl;
249 err = -EINVAL;
250 return err;
251 }
252 } else {
253 if ( (p == "single") || (p == "multiple")) {
254 mds.profile["technique"] = p;
255 pft.profile["technique"] = p;
256 } else {
257 *ss << "technique " << p << "is not currently supported, use one of"<<
258 "'single','multiple'"<< std::endl;
259 err = -EINVAL;
260 return err;
261 }
262 }
263 }
264 if ((d < k) || (d > k + m - 1)) {
265 *ss << "value of d " << d
266 << " must be within [ " << k << "," << k+m-1 << "]" << std::endl;
267 err = -EINVAL;
268 return err;
269 }
270
271 q = d - k + 1;
272 if ((k + m) % q) {
273 nu = q - (k + m) % q;
274 } else {
275 nu = 0;
276 }
277
278 if (k+m+nu > 254) {
279 err = -EINVAL;
280 return err;
281 }
282
283 if (mds.profile["plugin"] == "shec") {
284 mds.profile["c"] = '2';
285 pft.profile["c"] = '2';
286 }
287 mds.profile["k"] = std::to_string(k+nu);
288 mds.profile["m"] = std::to_string(m);
289 mds.profile["w"] = '8';
290
291 pft.profile["k"] = '2';
292 pft.profile["m"] = '2';
293 pft.profile["w"] = '8';
294
295 t = (k + m + nu) / q;
296 sub_chunk_no = pow_int(q, t);
297
298 dout(10) << __func__
299 << " (q,t,nu)=(" << q << "," << t << "," << nu <<")" << dendl;
300
301 return err;
302 }
303
304 int ErasureCodeClay::is_repair(const set<int> &want_to_read,
305 const set<int> &available_chunks) {
306
307 if (includes(available_chunks.begin(), available_chunks.end(),
308 want_to_read.begin(), want_to_read.end())) return 0;
309 if (want_to_read.size() > 1) return 0;
310
311 int i = *want_to_read.begin();
312 int lost_node_id = (i < k) ? i: i+nu;
313 for (int x = 0; x < q; x++) {
314 int node = (lost_node_id/q)*q+x;
315 node = (node < k) ? node : node-nu;
316 if (node != i) { // node in the same group other than erased node
317 if (available_chunks.count(node) == 0) return 0;
318 }
319 }
320
321 if (available_chunks.size() < (unsigned)d) return 0;
322 return 1;
323 }
324
325 int ErasureCodeClay::minimum_to_repair(const set<int> &want_to_read,
326 const set<int> &available_chunks,
327 map<int, vector<pair<int, int>>> *minimum)
328 {
329 int i = *want_to_read.begin();
330 int lost_node_index = (i < k) ? i : i+nu;
331 int rep_node_index = 0;
332
333 // add all the nodes in lost node's y column.
334 vector<pair<int, int>> sub_chunk_ind;
335 get_repair_subchunks(lost_node_index, sub_chunk_ind);
336 if ((available_chunks.size() >= (unsigned)d)) {
337 for (int j = 0; j < q; j++) {
338 if (j != lost_node_index%q) {
339 rep_node_index = (lost_node_index/q)*q+j;
340 if (rep_node_index < k) {
341 minimum->insert(make_pair(rep_node_index, sub_chunk_ind));
342 } else if (rep_node_index >= k+nu) {
343 minimum->insert(make_pair(rep_node_index-nu, sub_chunk_ind));
344 }
345 }
346 }
347 for (auto chunk : available_chunks) {
348 if (minimum->size() >= (unsigned)d) {
349 break;
350 }
351 if (!minimum->count(chunk)) {
352 minimum->emplace(chunk, sub_chunk_ind);
353 }
354 }
355 } else {
356 dout(0) << "minimum_to_repair: shouldn't have come here" << dendl;
357 ceph_assert(0);
358 }
359 ceph_assert(minimum->size() == (unsigned)d);
360 return 0;
361 }
362
363 void ErasureCodeClay::get_repair_subchunks(const int &lost_node,
364 vector<pair<int, int>> &repair_sub_chunks_ind)
365 {
366 const int y_lost = lost_node / q;
367 const int x_lost = lost_node % q;
368
369 const int seq_sc_count = pow_int(q, t-1-y_lost);
370 const int num_seq = pow_int(q, y_lost);
371
372 int index = x_lost * seq_sc_count;
373 for (int ind_seq = 0; ind_seq < num_seq; ind_seq++) {
374 repair_sub_chunks_ind.push_back(make_pair(index, seq_sc_count));
375 index += q * seq_sc_count;
376 }
377 }
378
379 int ErasureCodeClay::get_repair_sub_chunk_count(const set<int> &want_to_read)
380 {
381 int weight_vector[t];
382 std::fill(weight_vector, weight_vector + t, 0);
383 for (auto to_read : want_to_read) {
384 weight_vector[to_read / q]++;
385 }
386
387 int repair_subchunks_count = 1;
388 for (int y = 0; y < t; y++) {
389 repair_subchunks_count = repair_subchunks_count*(q-weight_vector[y]);
390 }
391
392 return sub_chunk_no - repair_subchunks_count;
393 }
394
395 int ErasureCodeClay::repair(const set<int> &want_to_read,
396 const map<int, bufferlist> &chunks,
397 map<int, bufferlist> *repaired, int chunk_size)
398 {
399
400 ceph_assert((want_to_read.size() == 1) && (chunks.size() == (unsigned)d));
401
402 int repair_sub_chunk_no = get_repair_sub_chunk_count(want_to_read);
403 vector<pair<int, int>> repair_sub_chunks_ind;
404
405 unsigned repair_blocksize = chunks.begin()->second.length();
406 assert(repair_blocksize%repair_sub_chunk_no == 0);
407
408 unsigned sub_chunksize = repair_blocksize/repair_sub_chunk_no;
409 unsigned chunksize = sub_chunk_no*sub_chunksize;
410
411 ceph_assert(chunksize == (unsigned)chunk_size);
412
413 map<int, bufferlist> recovered_data;
414 map<int, bufferlist> helper_data;
415 set<int> aloof_nodes;
416
417 for (int i = 0; i < k + m; i++) {
418 // included helper data only for d+nu nodes.
419 if (auto found = chunks.find(i); found != chunks.end()) { // i is a helper
420 if (i<k) {
421 helper_data[i] = found->second;
422 } else {
423 helper_data[i+nu] = found->second;
424 }
425 } else {
426 if (i != *want_to_read.begin()) { // aloof node case.
427 int aloof_node_id = (i < k) ? i: i+nu;
428 aloof_nodes.insert(aloof_node_id);
429 } else {
430 bufferptr ptr(buffer::create_aligned(chunksize, SIMD_ALIGN));
431 ptr.zero();
432 int lost_node_id = (i < k) ? i : i+nu;
433 (*repaired)[i].push_back(ptr);
434 recovered_data[lost_node_id] = (*repaired)[i];
435 get_repair_subchunks(lost_node_id, repair_sub_chunks_ind);
436 }
437 }
438 }
439
440 // this is for shortened codes i.e., when nu > 0
441 for (int i=k; i < k+nu; i++) {
442 bufferptr ptr(buffer::create_aligned(repair_blocksize, SIMD_ALIGN));
443 ptr.zero();
444 helper_data[i].push_back(ptr);
445 }
446
447 ceph_assert(helper_data.size()+aloof_nodes.size()+recovered_data.size() ==
448 (unsigned) q*t);
449
450 int r = repair_one_lost_chunk(recovered_data, aloof_nodes,
451 helper_data, repair_blocksize,
452 repair_sub_chunks_ind);
453
454 // clear buffers created for the purpose of shortening
455 for (int i = k; i < k+nu; i++) {
456 helper_data[i].clear();
457 }
458
459 return r;
460 }
461
462 int ErasureCodeClay::repair_one_lost_chunk(map<int, bufferlist> &recovered_data,
463 set<int> &aloof_nodes,
464 map<int, bufferlist> &helper_data,
465 int repair_blocksize,
466 vector<pair<int,int>> &repair_sub_chunks_ind)
467 {
468 unsigned repair_subchunks = (unsigned)sub_chunk_no / q;
469 unsigned sub_chunksize = repair_blocksize / repair_subchunks;
470
471 int z_vec[t];
472 map<int, set<int> > ordered_planes;
473 map<int, int> repair_plane_to_ind;
474 int count_retrieved_sub_chunks = 0;
475 int plane_ind = 0;
476
477 bufferptr buf(buffer::create_aligned(sub_chunksize, SIMD_ALIGN));
478 bufferlist temp_buf;
479 temp_buf.push_back(buf);
480
481 for (auto [index,count] : repair_sub_chunks_ind) {
482 for (int j = index; j < index + count; j++) {
483 get_plane_vector(j, z_vec);
484 int order = 0;
485 // check across all erasures and aloof nodes
486 for ([[maybe_unused]] auto& [node, bl] : recovered_data) {
487 if (node % q == z_vec[node / q]) order++;
488 (void)bl; // silence -Wunused-variable
489 }
490 for (auto node : aloof_nodes) {
491 if (node % q == z_vec[node / q]) order++;
492 }
493 ceph_assert(order > 0);
494 ordered_planes[order].insert(j);
495 // to keep track of a sub chunk within helper buffer recieved
496 repair_plane_to_ind[j] = plane_ind;
497 plane_ind++;
498 }
499 }
500 assert((unsigned)plane_ind == repair_subchunks);
501
502 int plane_count = 0;
503 for (int i = 0; i < q*t; i++) {
504 if (U_buf[i].length() == 0) {
505 bufferptr buf(buffer::create_aligned(sub_chunk_no*sub_chunksize, SIMD_ALIGN));
506 buf.zero();
507 U_buf[i].push_back(std::move(buf));
508 }
509 }
510
511 int lost_chunk;
512 int count = 0;
513 for ([[maybe_unused]] auto& [node, bl] : recovered_data) {
514 lost_chunk = node;
515 count++;
516 (void)bl; // silence -Wunused-variable
517 }
518 ceph_assert(count == 1);
519
520 set<int> erasures;
521 for (int i = 0; i < q; i++) {
522 erasures.insert(lost_chunk - lost_chunk % q + i);
523 }
524 for (auto node : aloof_nodes) {
525 erasures.insert(node);
526 }
527
528 for (int order = 1; ;order++) {
529 if (ordered_planes.count(order) == 0) {
530 break;
531 }
532 plane_count += ordered_planes[order].size();
533 for (auto z : ordered_planes[order]) {
534 get_plane_vector(z, z_vec);
535
536 for (int y = 0; y < t; y++) {
537 for (int x = 0; x < q; x++) {
538 int node_xy = y*q + x;
539 map<int, bufferlist> known_subchunks;
540 map<int, bufferlist> pftsubchunks;
541 set<int> pft_erasures;
542 if (erasures.count(node_xy) == 0) {
543 assert(helper_data.count(node_xy) > 0);
544 int z_sw = z + (x - z_vec[y])*pow_int(q,t-1-y);
545 int node_sw = y*q + z_vec[y];
546 int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
547 if (z_vec[y] > x) {
548 i0 = 1;
549 i1 = 0;
550 i2 = 3;
551 i3 = 2;
552 }
553 if (aloof_nodes.count(node_sw) > 0) {
554 assert(repair_plane_to_ind.count(z) > 0);
555 assert(repair_plane_to_ind.count(z_sw) > 0);
556 pft_erasures.insert(i2);
557 known_subchunks[i0].substr_of(helper_data[node_xy], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize);
558 known_subchunks[i3].substr_of(U_buf[node_sw], z_sw*sub_chunksize, sub_chunksize);
559 pftsubchunks[i0] = known_subchunks[i0];
560 pftsubchunks[i1] = temp_buf;
561 pftsubchunks[i2].substr_of(U_buf[node_xy], z*sub_chunksize, sub_chunksize);
562 pftsubchunks[i3] = known_subchunks[i3];
563 for (int i=0; i<3; i++) {
564 pftsubchunks[i].rebuild_aligned(SIMD_ALIGN);
565 }
566 pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks);
567 } else {
568 ceph_assert(helper_data.count(node_sw) > 0);
569 ceph_assert(repair_plane_to_ind.count(z) > 0);
570 if (z_vec[y] != x){
571 pft_erasures.insert(i2);
572 ceph_assert(repair_plane_to_ind.count(z_sw) > 0);
573 known_subchunks[i0].substr_of(helper_data[node_xy], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize);
574 known_subchunks[i1].substr_of(helper_data[node_sw], repair_plane_to_ind[z_sw]*sub_chunksize, sub_chunksize);
575 pftsubchunks[i0] = known_subchunks[i0];
576 pftsubchunks[i1] = known_subchunks[i1];
577 pftsubchunks[i2].substr_of(U_buf[node_xy], z*sub_chunksize, sub_chunksize);
578 pftsubchunks[i3].substr_of(temp_buf, 0, sub_chunksize);
579 for (int i=0; i<3; i++) {
580 pftsubchunks[i].rebuild_aligned(SIMD_ALIGN);
581 }
582 pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks);
583 } else {
584 char* uncoupled_chunk = U_buf[node_xy].c_str();
585 char* coupled_chunk = helper_data[node_xy].c_str();
586 memcpy(&uncoupled_chunk[z*sub_chunksize],
587 &coupled_chunk[repair_plane_to_ind[z]*sub_chunksize],
588 sub_chunksize);
589 }
590 }
591 }
592 } // x
593 } // y
594 ceph_assert(erasures.size() <= (unsigned)m);
595 decode_uncoupled(erasures, z, sub_chunksize);
596
597 for (auto i : erasures) {
598 int x = i % q;
599 int y = i / q;
600 int node_sw = y*q+z_vec[y];
601 int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);
602 set<int> pft_erasures;
603 map<int, bufferlist> known_subchunks;
604 map<int, bufferlist> pftsubchunks;
605 int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
606 if (z_vec[y] > x) {
607 i0 = 1;
608 i1 = 0;
609 i2 = 3;
610 i3 = 2;
611 }
612 // make sure it is not an aloof node before you retrieve repaired_data
613 if (aloof_nodes.count(i) == 0) {
614 if (x == z_vec[y]) { // hole-dot pair (type 0)
615 char* coupled_chunk = recovered_data[i].c_str();
616 char* uncoupled_chunk = U_buf[i].c_str();
617 memcpy(&coupled_chunk[z*sub_chunksize],
618 &uncoupled_chunk[z*sub_chunksize],
619 sub_chunksize);
620 count_retrieved_sub_chunks++;
621 } else {
622 ceph_assert(y == lost_chunk / q);
623 ceph_assert(node_sw == lost_chunk);
624 ceph_assert(helper_data.count(i) > 0);
625 pft_erasures.insert(i1);
626 known_subchunks[i0].substr_of(helper_data[i], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize);
627 known_subchunks[i2].substr_of(U_buf[i], z*sub_chunksize, sub_chunksize);
628
629 pftsubchunks[i0] = known_subchunks[i0];
630 pftsubchunks[i1].substr_of(recovered_data[node_sw], z_sw*sub_chunksize, sub_chunksize);
631 pftsubchunks[i2] = known_subchunks[i2];
632 pftsubchunks[i3] = temp_buf;
633 for (int i=0; i<3; i++) {
634 pftsubchunks[i].rebuild_aligned(SIMD_ALIGN);
635 }
636 pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks);
637 }
638 }
639 } // recover all erasures
640 } // planes of particular order
641 } // order
642
643 return 0;
644 }
645
646
647 int ErasureCodeClay::decode_layered(set<int> &erased_chunks,
648 map<int, bufferlist> *chunks)
649 {
650 int num_erasures = erased_chunks.size();
651
652 int size = (*chunks)[0].length();
653 ceph_assert(size%sub_chunk_no == 0);
654 int sc_size = size / sub_chunk_no;
655
656 ceph_assert(num_erasures > 0);
657
658 for (int i = k+nu; (num_erasures < m) && (i < q*t); i++) {
659 if ([[maybe_unused]] auto [it, added] = erased_chunks.emplace(i); added) {
660 num_erasures++;
661 (void)it; // silence -Wunused-variable
662 }
663 }
664 ceph_assert(num_erasures == m);
665
666 int max_iscore = get_max_iscore(erased_chunks);
667 int order[sub_chunk_no];
668 int z_vec[t];
669 for (int i = 0; i < q*t; i++) {
670 if (U_buf[i].length() == 0) {
671 bufferptr buf(buffer::create_aligned(size, SIMD_ALIGN));
672 buf.zero();
673 U_buf[i].push_back(std::move(buf));
674 }
675 }
676
677 set_planes_sequential_decoding_order(order, erased_chunks);
678
679 for (int iscore = 0; iscore <= max_iscore; iscore++) {
680 for (int z = 0; z < sub_chunk_no; z++) {
681 if (order[z] == iscore) {
682 decode_erasures(erased_chunks, z, chunks, sc_size);
683 }
684 }
685
686 for (int z = 0; z < sub_chunk_no; z++) {
687 if (order[z] == iscore) {
688 get_plane_vector(z, z_vec);
689 for (auto node_xy : erased_chunks) {
690 int x = node_xy % q;
691 int y = node_xy / q;
692 int node_sw = y*q+z_vec[y];
693 if (z_vec[y] != x) {
694 if (erased_chunks.count(node_sw) == 0) {
695 recover_type1_erasure(chunks, x, y, z, z_vec, sc_size);
696 } else if (z_vec[y] < x){
697 ceph_assert(erased_chunks.count(node_sw) > 0);
698 ceph_assert(z_vec[y] != x);
699 get_coupled_from_uncoupled(chunks, x, y, z, z_vec, sc_size);
700 }
701 } else {
702 char* C = (*chunks)[node_xy].c_str();
703 char* U = U_buf[node_xy].c_str();
704 memcpy(&C[z*sc_size], &U[z*sc_size], sc_size);
705 }
706 }
707 }
708 } // plane
709 } // iscore, order
710
711 return 0;
712 }
713
714 int ErasureCodeClay::decode_erasures(const set<int>& erased_chunks, int z,
715 map<int, bufferlist>* chunks, int sc_size)
716 {
717 int z_vec[t];
718
719 get_plane_vector(z, z_vec);
720
721 for (int x = 0; x < q; x++) {
722 for (int y = 0; y < t; y++) {
723 int node_xy = q*y+x;
724 int node_sw = q*y+z_vec[y];
725 if (erased_chunks.count(node_xy) == 0) {
726 if (z_vec[y] < x) {
727 get_uncoupled_from_coupled(chunks, x, y, z, z_vec, sc_size);
728 } else if (z_vec[y] == x) {
729 char* uncoupled_chunk = U_buf[node_xy].c_str();
730 char* coupled_chunk = (*chunks)[node_xy].c_str();
731 memcpy(&uncoupled_chunk[z*sc_size], &coupled_chunk[z*sc_size], sc_size);
732 } else {
733 if (erased_chunks.count(node_sw) > 0) {
734 get_uncoupled_from_coupled(chunks, x, y, z, z_vec, sc_size);
735 }
736 }
737 }
738 }
739 }
740 return decode_uncoupled(erased_chunks, z, sc_size);
741 }
742
743 int ErasureCodeClay::decode_uncoupled(const set<int>& erased_chunks, int z, int sc_size)
744 {
745 map<int, bufferlist> known_subchunks;
746 map<int, bufferlist> all_subchunks;
747
748 for (int i = 0; i < q*t; i++) {
749 if (erased_chunks.count(i) == 0) {
750 known_subchunks[i].substr_of(U_buf[i], z*sc_size, sc_size);
751 all_subchunks[i] = known_subchunks[i];
752 } else {
753 all_subchunks[i].substr_of(U_buf[i], z*sc_size, sc_size);
754 }
755 all_subchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
756 assert(all_subchunks[i].is_contiguous());
757 }
758
759 mds.erasure_code->decode_chunks(erased_chunks, known_subchunks, &all_subchunks);
760 return 0;
761 }
762
763 void ErasureCodeClay::set_planes_sequential_decoding_order(int* order, set<int>& erasures) {
764 int z_vec[t];
765 for (int z = 0; z < sub_chunk_no; z++) {
766 get_plane_vector(z,z_vec);
767 order[z] = 0;
768 for (auto i : erasures) {
769 if (i % q == z_vec[i / q]) {
770 order[z] = order[z] + 1;
771 }
772 }
773 }
774 }
775
776 void ErasureCodeClay::recover_type1_erasure(map<int, bufferlist>* chunks,
777 int x, int y, int z,
778 int* z_vec, int sc_size)
779 {
780 set<int> erased_chunks;
781
782 int node_xy = y*q+x;
783 int node_sw = y*q+z_vec[y];
784 int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);
785
786 map<int, bufferlist> known_subchunks;
787 map<int, bufferlist> pftsubchunks;
788 bufferptr ptr(buffer::create_aligned(sc_size, SIMD_ALIGN));
789 ptr.zero();
790
791 int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
792 if (z_vec[y] > x) {
793 i0 = 1;
794 i1 = 0;
795 i2 = 3;
796 i3 = 2;
797 }
798
799 erased_chunks.insert(i0);
800 pftsubchunks[i0].substr_of((*chunks)[node_xy], z * sc_size, sc_size);
801 known_subchunks[i1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size);
802 known_subchunks[i2].substr_of(U_buf[node_xy], z * sc_size, sc_size);
803 pftsubchunks[i1] = known_subchunks[i1];
804 pftsubchunks[i2] = known_subchunks[i2];
805 pftsubchunks[i3].push_back(ptr);
806
807 for (int i=0; i<3; i++) {
808 pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
809 }
810
811 pft.erasure_code->decode_chunks(erased_chunks, known_subchunks, &pftsubchunks);
812 }
813
814 void ErasureCodeClay::get_coupled_from_uncoupled(map<int, bufferlist>* chunks,
815 int x, int y, int z,
816 int* z_vec, int sc_size)
817 {
818 set<int> erased_chunks = {0, 1};
819
820 int node_xy = y*q+x;
821 int node_sw = y*q+z_vec[y];
822 int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);
823
824 ceph_assert(z_vec[y] < x);
825 map<int, bufferlist> uncoupled_subchunks;
826 uncoupled_subchunks[2].substr_of(U_buf[node_xy], z * sc_size, sc_size);
827 uncoupled_subchunks[3].substr_of(U_buf[node_sw], z_sw * sc_size, sc_size);
828
829 map<int, bufferlist> pftsubchunks;
830 pftsubchunks[0].substr_of((*chunks)[node_xy], z * sc_size, sc_size);
831 pftsubchunks[1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size);
832 pftsubchunks[2] = uncoupled_subchunks[2];
833 pftsubchunks[3] = uncoupled_subchunks[3];
834
835 for (int i=0; i<3; i++) {
836 pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
837 }
838 pft.erasure_code->decode_chunks(erased_chunks, uncoupled_subchunks, &pftsubchunks);
839 }
840
841 void ErasureCodeClay::get_uncoupled_from_coupled(map<int, bufferlist>* chunks,
842 int x, int y, int z,
843 int* z_vec, int sc_size)
844 {
845 set<int> erased_chunks = {2, 3};
846
847 int node_xy = y*q+x;
848 int node_sw = y*q+z_vec[y];
849 int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);
850
851 int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
852 if (z_vec[y] > x) {
853 i0 = 1;
854 i1 = 0;
855 i2 = 3;
856 i3 = 2;
857 }
858 map<int, bufferlist> coupled_subchunks;
859 coupled_subchunks[i0].substr_of((*chunks)[node_xy], z * sc_size, sc_size);
860 coupled_subchunks[i1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size);
861
862 map<int, bufferlist> pftsubchunks;
863 pftsubchunks[0] = coupled_subchunks[0];
864 pftsubchunks[1] = coupled_subchunks[1];
865 pftsubchunks[i2].substr_of(U_buf[node_xy], z * sc_size, sc_size);
866 pftsubchunks[i3].substr_of(U_buf[node_sw], z_sw * sc_size, sc_size);
867 for (int i=0; i<3; i++) {
868 pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
869 }
870 pft.erasure_code->decode_chunks(erased_chunks, coupled_subchunks, &pftsubchunks);
871 }
872
873 int ErasureCodeClay::get_max_iscore(set<int>& erased_chunks)
874 {
875 int weight_vec[t];
876 int iscore = 0;
877 memset(weight_vec, 0, sizeof(int)*t);
878
879 for (auto i : erased_chunks) {
880 if (weight_vec[i / q] == 0) {
881 weight_vec[i / q] = 1;
882 iscore++;
883 }
884 }
885 return iscore;
886 }
887
888 void ErasureCodeClay::get_plane_vector(int z, int* z_vec)
889 {
890 for (int i = 0; i < t; i++) {
891 z_vec[t-1-i] = z % q;
892 z = (z - z_vec[t-1-i]) / q;
893 }
894 }