1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2018 Indian Institute of Science <office.ece@iisc.ac.in> | |
7 | * | |
8 | * Author: Myna Vajha <mynaramana@gmail.com> | |
9 | * | |
10 | * This library is free software; you can redistribute it and/or | |
11 | * modify it under the terms of the GNU Lesser General Public | |
12 | * License as published by the Free Software Foundation; either | |
13 | * version 2.1 of the License, or (at your option) any later version. | |
14 | * | |
15 | */ | |
16 | ||
17 | #include <errno.h> | |
18 | #include <algorithm> | |
19 | ||
20 | #include "ErasureCodeClay.h" | |
21 | ||
22 | #include "common/debug.h" | |
23 | #include "erasure-code/ErasureCodePlugin.h" | |
24 | #include "include/ceph_assert.h" | |
25 | #include "include/str_map.h" | |
26 | #include "include/stringify.h" | |
27 | #include "osd/osd_types.h" | |
28 | ||
29 | ||
30 | #define dout_context g_ceph_context | |
31 | #define dout_subsys ceph_subsys_osd | |
32 | #undef dout_prefix | |
33 | #define dout_prefix _prefix(_dout) | |
34 | ||
35 | #define LARGEST_VECTOR_WORDSIZE 16 | |
36 | #define talloc(type, num) (type *) malloc(sizeof(type)*(num)) | |
37 | ||
38 | using namespace std; | |
9f95a23c | 39 | using namespace ceph; |
// Prefix applied to every debug line emitted by this translation unit.
static std::ostream& _prefix(std::ostream* _dout)
{
  *_dout << "ErasureCodeClay: ";
  return *_dout;
}
44 | ||
// Integer exponentiation by recursive squaring: returns a raised to the
// power x. For x <= 0 the result matches the original iterative version
// (x == 0 yields 1; negative x is truncated toward zero step by step).
static int pow_int(int a, int x) {
  if (x == 0)
    return 1;
  int half = pow_int(a * a, x / 2);
  return (x & 1) ? half * a : half;
}
54 | ||
55 | ErasureCodeClay::~ErasureCodeClay() | |
56 | { | |
57 | for (int i = 0; i < q*t; i++) { | |
58 | if (U_buf[i].length() != 0) U_buf[i].clear(); | |
59 | } | |
60 | } | |
61 | ||
// Initialize the CLAY code: parse the profile, run the base-class init,
// then instantiate the two scalar codes the layered construction is built
// on: mds (the (k+nu, m) MDS code used for plane-wise decoding) and pft
// (the 2x2 pairwise transform used to couple/uncouple sub-chunks).
// Returns 0 on success; on failure the first non-zero error is returned
// and a message is appended to *ss.
int ErasureCodeClay::init(ErasureCodeProfile &profile,
			  ostream *ss)
{
  int r;
  // parse() must run first: it fills in mds.profile / pft.profile, which
  // the factory calls below read.
  r = parse(profile, ss);
  if (r)
    return r;

  r = ErasureCode::init(profile, ss);
  if (r)
    return r;
  ErasureCodePluginRegistry &registry = ErasureCodePluginRegistry::instance();
  r = registry.factory(mds.profile["plugin"],
		       directory,
		       mds.profile,
		       &mds.erasure_code,
		       ss);
  if (r)
    return r;
  r = registry.factory(pft.profile["plugin"],
		       directory,
		       pft.profile,
		       &pft.erasure_code,
		       ss);
  return r;

}
89 | ||
90 | unsigned int ErasureCodeClay::get_chunk_size(unsigned int object_size) const | |
91 | { | |
92 | unsigned int alignment_scalar_code = pft.erasure_code->get_chunk_size(1); | |
93 | unsigned int alignment = sub_chunk_no * k * alignment_scalar_code; | |
94 | ||
95 | return round_up_to(object_size, alignment) / k; | |
96 | } | |
97 | ||
98 | int ErasureCodeClay::minimum_to_decode(const set<int> &want_to_read, | |
99 | const set<int> &available, | |
100 | map<int, vector<pair<int, int>>> *minimum) | |
101 | { | |
102 | if (is_repair(want_to_read, available)) { | |
103 | return minimum_to_repair(want_to_read, available, minimum); | |
104 | } else { | |
105 | return ErasureCode::minimum_to_decode(want_to_read, available, minimum); | |
106 | } | |
107 | } | |
108 | ||
109 | int ErasureCodeClay::decode(const set<int> &want_to_read, | |
110 | const map<int, bufferlist> &chunks, | |
111 | map<int, bufferlist> *decoded, int chunk_size) | |
112 | { | |
113 | set<int> avail; | |
114 | for ([[maybe_unused]] auto& [node, bl] : chunks) { | |
115 | avail.insert(node); | |
116 | (void)bl; // silence -Wunused-variable | |
117 | } | |
118 | ||
119 | if (is_repair(want_to_read, avail) && | |
120 | ((unsigned int)chunk_size > chunks.begin()->second.length())) { | |
121 | return repair(want_to_read, chunks, decoded, chunk_size); | |
122 | } else { | |
123 | return ErasureCode::_decode(want_to_read, chunks, decoded); | |
124 | } | |
125 | } | |
126 | ||
void p(const set<int> &s) { cerr << s; } // for gdb: call p(some_set) to print a chunk-id set
128 | ||
129 | int ErasureCodeClay::encode_chunks(const set<int> &want_to_encode, | |
130 | map<int, bufferlist> *encoded) | |
131 | { | |
132 | map<int, bufferlist> chunks; | |
133 | set<int> parity_chunks; | |
134 | int chunk_size = (*encoded)[0].length(); | |
135 | ||
136 | for (int i = 0; i < k + m; i++) { | |
137 | if (i < k) { | |
138 | chunks[i] = (*encoded)[i]; | |
139 | } else { | |
140 | chunks[i+nu] = (*encoded)[i]; | |
141 | parity_chunks.insert(i+nu); | |
142 | } | |
143 | } | |
144 | ||
145 | for (int i = k; i < k + nu; i++) { | |
146 | bufferptr buf(buffer::create_aligned(chunk_size, SIMD_ALIGN)); | |
147 | buf.zero(); | |
148 | chunks[i].push_back(std::move(buf)); | |
149 | } | |
150 | ||
151 | int res = decode_layered(parity_chunks, &chunks); | |
152 | for (int i = k ; i < k + nu; i++) { | |
153 | // need to clean some of the intermediate chunks here!! | |
154 | chunks[i].clear(); | |
155 | } | |
156 | return res; | |
157 | } | |
158 | ||
159 | int ErasureCodeClay::decode_chunks(const set<int> &want_to_read, | |
160 | const map<int, bufferlist> &chunks, | |
161 | map<int, bufferlist> *decoded) | |
162 | { | |
163 | set<int> erasures; | |
164 | map<int, bufferlist> coded_chunks; | |
165 | ||
166 | for (int i = 0; i < k + m; i++) { | |
167 | if (chunks.count(i) == 0) { | |
168 | erasures.insert(i < k ? i : i+nu); | |
169 | } | |
170 | ceph_assert(decoded->count(i) > 0); | |
171 | coded_chunks[i < k ? i : i+nu] = (*decoded)[i]; | |
172 | } | |
173 | int chunk_size = coded_chunks[0].length(); | |
174 | ||
175 | for (int i = k; i < k+nu; i++) { | |
176 | bufferptr buf(buffer::create_aligned(chunk_size, SIMD_ALIGN)); | |
177 | buf.zero(); | |
178 | coded_chunks[i].push_back(std::move(buf)); | |
179 | } | |
180 | ||
181 | int res = decode_layered(erasures, &coded_chunks); | |
182 | for (int i = k; i < k+nu; i++) { | |
183 | coded_chunks[i].clear(); | |
184 | } | |
185 | return res; | |
186 | } | |
187 | ||
188 | int ErasureCodeClay::parse(ErasureCodeProfile &profile, | |
189 | ostream *ss) | |
190 | { | |
191 | int err = 0; | |
192 | err = ErasureCode::parse(profile, ss); | |
193 | err |= to_int("k", profile, &k, DEFAULT_K, ss); | |
194 | err |= to_int("m", profile, &m, DEFAULT_M, ss); | |
195 | ||
196 | err |= sanity_check_k_m(k, m, ss); | |
197 | ||
198 | err |= to_int("d", profile, &d, std::to_string(k+m-1), ss); | |
199 | ||
200 | // check for scalar_mds in profile input | |
201 | if (profile.find("scalar_mds") == profile.end() || | |
202 | profile.find("scalar_mds")->second.empty()) { | |
203 | mds.profile["plugin"] = "jerasure"; | |
204 | pft.profile["plugin"] = "jerasure"; | |
205 | } else { | |
206 | std::string p = profile.find("scalar_mds")->second; | |
207 | if ((p == "jerasure") || (p == "isa") || (p == "shec")) { | |
208 | mds.profile["plugin"] = p; | |
209 | pft.profile["plugin"] = p; | |
210 | } else { | |
211 | *ss << "scalar_mds " << mds.profile["plugin"] << | |
212 | "is not currently supported, use one of 'jerasure',"<< | |
213 | " 'isa', 'shec'" << std::endl; | |
214 | err = -EINVAL; | |
215 | return err; | |
216 | } | |
217 | } | |
218 | ||
219 | if (profile.find("technique") == profile.end() || | |
220 | profile.find("technique")->second.empty()) { | |
221 | if ((mds.profile["plugin"]=="jerasure") || (mds.profile["plugin"]=="isa") ) { | |
222 | mds.profile["technique"] = "reed_sol_van"; | |
223 | pft.profile["technique"] = "reed_sol_van"; | |
224 | } else { | |
225 | mds.profile["technique"] = "single"; | |
226 | pft.profile["technique"] = "single"; | |
227 | } | |
228 | } else { | |
229 | std::string p = profile.find("technique")->second; | |
230 | if (mds.profile["plugin"] == "jerasure") { | |
231 | if ( (p == "reed_sol_van") || (p == "reed_sol_r6_op") || (p == "cauchy_orig") | |
232 | || (p == "cauchy_good") || (p == "liber8tion")) { | |
233 | mds.profile["technique"] = p; | |
234 | pft.profile["technique"] = p; | |
235 | } else { | |
236 | *ss << "technique " << p << "is not currently supported, use one of " | |
237 | << "reed_sol_van', 'reed_sol_r6_op','cauchy_orig'," | |
238 | << "'cauchy_good','liber8tion'"<< std::endl; | |
239 | err = -EINVAL; | |
240 | return err; | |
241 | } | |
242 | } else if (mds.profile["plugin"] == "isa") { | |
243 | if ( (p == "reed_sol_van") || (p == "cauchy")) { | |
244 | mds.profile["technique"] = p; | |
245 | pft.profile["technique"] = p; | |
246 | } else { | |
247 | *ss << "technique " << p << "is not currently supported, use one of" | |
248 | << "'reed_sol_van','cauchy'"<< std::endl; | |
249 | err = -EINVAL; | |
250 | return err; | |
251 | } | |
252 | } else { | |
253 | if ( (p == "single") || (p == "multiple")) { | |
254 | mds.profile["technique"] = p; | |
255 | pft.profile["technique"] = p; | |
256 | } else { | |
257 | *ss << "technique " << p << "is not currently supported, use one of"<< | |
258 | "'single','multiple'"<< std::endl; | |
259 | err = -EINVAL; | |
260 | return err; | |
261 | } | |
262 | } | |
263 | } | |
264 | if ((d < k) || (d > k + m - 1)) { | |
265 | *ss << "value of d " << d | |
266 | << " must be within [ " << k << "," << k+m-1 << "]" << std::endl; | |
267 | err = -EINVAL; | |
268 | return err; | |
269 | } | |
270 | ||
271 | q = d - k + 1; | |
272 | if ((k + m) % q) { | |
273 | nu = q - (k + m) % q; | |
274 | } else { | |
275 | nu = 0; | |
276 | } | |
277 | ||
278 | if (k+m+nu > 254) { | |
279 | err = -EINVAL; | |
280 | return err; | |
281 | } | |
282 | ||
283 | if (mds.profile["plugin"] == "shec") { | |
284 | mds.profile["c"] = '2'; | |
285 | pft.profile["c"] = '2'; | |
286 | } | |
287 | mds.profile["k"] = std::to_string(k+nu); | |
288 | mds.profile["m"] = std::to_string(m); | |
289 | mds.profile["w"] = '8'; | |
290 | ||
291 | pft.profile["k"] = '2'; | |
292 | pft.profile["m"] = '2'; | |
293 | pft.profile["w"] = '8'; | |
294 | ||
295 | t = (k + m + nu) / q; | |
296 | sub_chunk_no = pow_int(q, t); | |
297 | ||
298 | dout(10) << __func__ | |
299 | << " (q,t,nu)=(" << q << "," << t << "," << nu <<")" << dendl; | |
300 | ||
301 | return err; | |
302 | } | |
303 | ||
304 | int ErasureCodeClay::is_repair(const set<int> &want_to_read, | |
305 | const set<int> &available_chunks) { | |
306 | ||
307 | if (includes(available_chunks.begin(), available_chunks.end(), | |
308 | want_to_read.begin(), want_to_read.end())) return 0; | |
309 | if (want_to_read.size() > 1) return 0; | |
310 | ||
311 | int i = *want_to_read.begin(); | |
312 | int lost_node_id = (i < k) ? i: i+nu; | |
313 | for (int x = 0; x < q; x++) { | |
314 | int node = (lost_node_id/q)*q+x; | |
315 | node = (node < k) ? node : node-nu; | |
316 | if (node != i) { // node in the same group other than erased node | |
317 | if (available_chunks.count(node) == 0) return 0; | |
318 | } | |
319 | } | |
320 | ||
321 | if (available_chunks.size() < (unsigned)d) return 0; | |
322 | return 1; | |
323 | } | |
324 | ||
325 | int ErasureCodeClay::minimum_to_repair(const set<int> &want_to_read, | |
326 | const set<int> &available_chunks, | |
327 | map<int, vector<pair<int, int>>> *minimum) | |
328 | { | |
329 | int i = *want_to_read.begin(); | |
330 | int lost_node_index = (i < k) ? i : i+nu; | |
331 | int rep_node_index = 0; | |
332 | ||
333 | // add all the nodes in lost node's y column. | |
334 | vector<pair<int, int>> sub_chunk_ind; | |
335 | get_repair_subchunks(lost_node_index, sub_chunk_ind); | |
336 | if ((available_chunks.size() >= (unsigned)d)) { | |
337 | for (int j = 0; j < q; j++) { | |
338 | if (j != lost_node_index%q) { | |
339 | rep_node_index = (lost_node_index/q)*q+j; | |
340 | if (rep_node_index < k) { | |
341 | minimum->insert(make_pair(rep_node_index, sub_chunk_ind)); | |
342 | } else if (rep_node_index >= k+nu) { | |
343 | minimum->insert(make_pair(rep_node_index-nu, sub_chunk_ind)); | |
344 | } | |
345 | } | |
346 | } | |
347 | for (auto chunk : available_chunks) { | |
348 | if (minimum->size() >= (unsigned)d) { | |
349 | break; | |
350 | } | |
351 | if (!minimum->count(chunk)) { | |
352 | minimum->emplace(chunk, sub_chunk_ind); | |
353 | } | |
354 | } | |
355 | } else { | |
356 | dout(0) << "minimum_to_repair: shouldn't have come here" << dendl; | |
357 | ceph_assert(0); | |
358 | } | |
359 | ceph_assert(minimum->size() == (unsigned)d); | |
360 | return 0; | |
361 | } | |
362 | ||
363 | void ErasureCodeClay::get_repair_subchunks(const int &lost_node, | |
364 | vector<pair<int, int>> &repair_sub_chunks_ind) | |
365 | { | |
366 | const int y_lost = lost_node / q; | |
367 | const int x_lost = lost_node % q; | |
368 | ||
369 | const int seq_sc_count = pow_int(q, t-1-y_lost); | |
370 | const int num_seq = pow_int(q, y_lost); | |
371 | ||
372 | int index = x_lost * seq_sc_count; | |
373 | for (int ind_seq = 0; ind_seq < num_seq; ind_seq++) { | |
374 | repair_sub_chunks_ind.push_back(make_pair(index, seq_sc_count)); | |
375 | index += q * seq_sc_count; | |
376 | } | |
377 | } | |
378 | ||
379 | int ErasureCodeClay::get_repair_sub_chunk_count(const set<int> &want_to_read) | |
380 | { | |
381 | int weight_vector[t]; | |
382 | std::fill(weight_vector, weight_vector + t, 0); | |
383 | for (auto to_read : want_to_read) { | |
384 | weight_vector[to_read / q]++; | |
385 | } | |
386 | ||
387 | int repair_subchunks_count = 1; | |
388 | for (int y = 0; y < t; y++) { | |
389 | repair_subchunks_count = repair_subchunks_count*(q-weight_vector[y]); | |
390 | } | |
391 | ||
392 | return sub_chunk_no - repair_subchunks_count; | |
393 | } | |
394 | ||
395 | int ErasureCodeClay::repair(const set<int> &want_to_read, | |
396 | const map<int, bufferlist> &chunks, | |
397 | map<int, bufferlist> *repaired, int chunk_size) | |
398 | { | |
399 | ||
400 | ceph_assert((want_to_read.size() == 1) && (chunks.size() == (unsigned)d)); | |
401 | ||
402 | int repair_sub_chunk_no = get_repair_sub_chunk_count(want_to_read); | |
403 | vector<pair<int, int>> repair_sub_chunks_ind; | |
404 | ||
405 | unsigned repair_blocksize = chunks.begin()->second.length(); | |
406 | assert(repair_blocksize%repair_sub_chunk_no == 0); | |
407 | ||
408 | unsigned sub_chunksize = repair_blocksize/repair_sub_chunk_no; | |
409 | unsigned chunksize = sub_chunk_no*sub_chunksize; | |
410 | ||
411 | ceph_assert(chunksize == (unsigned)chunk_size); | |
412 | ||
413 | map<int, bufferlist> recovered_data; | |
414 | map<int, bufferlist> helper_data; | |
415 | set<int> aloof_nodes; | |
416 | ||
417 | for (int i = 0; i < k + m; i++) { | |
418 | // included helper data only for d+nu nodes. | |
419 | if (auto found = chunks.find(i); found != chunks.end()) { // i is a helper | |
420 | if (i<k) { | |
421 | helper_data[i] = found->second; | |
422 | } else { | |
423 | helper_data[i+nu] = found->second; | |
424 | } | |
425 | } else { | |
426 | if (i != *want_to_read.begin()) { // aloof node case. | |
427 | int aloof_node_id = (i < k) ? i: i+nu; | |
428 | aloof_nodes.insert(aloof_node_id); | |
429 | } else { | |
430 | bufferptr ptr(buffer::create_aligned(chunksize, SIMD_ALIGN)); | |
431 | ptr.zero(); | |
432 | int lost_node_id = (i < k) ? i : i+nu; | |
433 | (*repaired)[i].push_back(ptr); | |
434 | recovered_data[lost_node_id] = (*repaired)[i]; | |
435 | get_repair_subchunks(lost_node_id, repair_sub_chunks_ind); | |
436 | } | |
437 | } | |
438 | } | |
439 | ||
440 | // this is for shortened codes i.e., when nu > 0 | |
441 | for (int i=k; i < k+nu; i++) { | |
442 | bufferptr ptr(buffer::create_aligned(repair_blocksize, SIMD_ALIGN)); | |
443 | ptr.zero(); | |
444 | helper_data[i].push_back(ptr); | |
445 | } | |
446 | ||
447 | ceph_assert(helper_data.size()+aloof_nodes.size()+recovered_data.size() == | |
448 | (unsigned) q*t); | |
449 | ||
450 | int r = repair_one_lost_chunk(recovered_data, aloof_nodes, | |
451 | helper_data, repair_blocksize, | |
452 | repair_sub_chunks_ind); | |
453 | ||
454 | // clear buffers created for the purpose of shortening | |
455 | for (int i = k; i < k+nu; i++) { | |
456 | helper_data[i].clear(); | |
457 | } | |
458 | ||
459 | return r; | |
460 | } | |
461 | ||
462 | int ErasureCodeClay::repair_one_lost_chunk(map<int, bufferlist> &recovered_data, | |
463 | set<int> &aloof_nodes, | |
464 | map<int, bufferlist> &helper_data, | |
465 | int repair_blocksize, | |
466 | vector<pair<int,int>> &repair_sub_chunks_ind) | |
467 | { | |
468 | unsigned repair_subchunks = (unsigned)sub_chunk_no / q; | |
469 | unsigned sub_chunksize = repair_blocksize / repair_subchunks; | |
470 | ||
471 | int z_vec[t]; | |
472 | map<int, set<int> > ordered_planes; | |
473 | map<int, int> repair_plane_to_ind; | |
474 | int count_retrieved_sub_chunks = 0; | |
475 | int plane_ind = 0; | |
476 | ||
477 | bufferptr buf(buffer::create_aligned(sub_chunksize, SIMD_ALIGN)); | |
478 | bufferlist temp_buf; | |
479 | temp_buf.push_back(buf); | |
480 | ||
481 | for (auto [index,count] : repair_sub_chunks_ind) { | |
482 | for (int j = index; j < index + count; j++) { | |
483 | get_plane_vector(j, z_vec); | |
484 | int order = 0; | |
485 | // check across all erasures and aloof nodes | |
486 | for ([[maybe_unused]] auto& [node, bl] : recovered_data) { | |
487 | if (node % q == z_vec[node / q]) order++; | |
488 | (void)bl; // silence -Wunused-variable | |
489 | } | |
490 | for (auto node : aloof_nodes) { | |
491 | if (node % q == z_vec[node / q]) order++; | |
492 | } | |
493 | ceph_assert(order > 0); | |
494 | ordered_planes[order].insert(j); | |
495 | // to keep track of a sub chunk within helper buffer recieved | |
496 | repair_plane_to_ind[j] = plane_ind; | |
497 | plane_ind++; | |
498 | } | |
499 | } | |
500 | assert((unsigned)plane_ind == repair_subchunks); | |
501 | ||
502 | int plane_count = 0; | |
503 | for (int i = 0; i < q*t; i++) { | |
504 | if (U_buf[i].length() == 0) { | |
505 | bufferptr buf(buffer::create_aligned(sub_chunk_no*sub_chunksize, SIMD_ALIGN)); | |
506 | buf.zero(); | |
507 | U_buf[i].push_back(std::move(buf)); | |
508 | } | |
509 | } | |
510 | ||
511 | int lost_chunk; | |
512 | int count = 0; | |
513 | for ([[maybe_unused]] auto& [node, bl] : recovered_data) { | |
514 | lost_chunk = node; | |
515 | count++; | |
516 | (void)bl; // silence -Wunused-variable | |
517 | } | |
518 | ceph_assert(count == 1); | |
519 | ||
520 | set<int> erasures; | |
521 | for (int i = 0; i < q; i++) { | |
522 | erasures.insert(lost_chunk - lost_chunk % q + i); | |
523 | } | |
524 | for (auto node : aloof_nodes) { | |
525 | erasures.insert(node); | |
526 | } | |
527 | ||
528 | for (int order = 1; ;order++) { | |
529 | if (ordered_planes.count(order) == 0) { | |
530 | break; | |
531 | } | |
532 | plane_count += ordered_planes[order].size(); | |
533 | for (auto z : ordered_planes[order]) { | |
534 | get_plane_vector(z, z_vec); | |
535 | ||
536 | for (int y = 0; y < t; y++) { | |
537 | for (int x = 0; x < q; x++) { | |
538 | int node_xy = y*q + x; | |
539 | map<int, bufferlist> known_subchunks; | |
540 | map<int, bufferlist> pftsubchunks; | |
541 | set<int> pft_erasures; | |
542 | if (erasures.count(node_xy) == 0) { | |
543 | assert(helper_data.count(node_xy) > 0); | |
544 | int z_sw = z + (x - z_vec[y])*pow_int(q,t-1-y); | |
545 | int node_sw = y*q + z_vec[y]; | |
546 | int i0 = 0, i1 = 1, i2 = 2, i3 = 3; | |
547 | if (z_vec[y] > x) { | |
548 | i0 = 1; | |
549 | i1 = 0; | |
550 | i2 = 3; | |
551 | i3 = 2; | |
552 | } | |
553 | if (aloof_nodes.count(node_sw) > 0) { | |
554 | assert(repair_plane_to_ind.count(z) > 0); | |
555 | assert(repair_plane_to_ind.count(z_sw) > 0); | |
556 | pft_erasures.insert(i2); | |
557 | known_subchunks[i0].substr_of(helper_data[node_xy], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize); | |
558 | known_subchunks[i3].substr_of(U_buf[node_sw], z_sw*sub_chunksize, sub_chunksize); | |
559 | pftsubchunks[i0] = known_subchunks[i0]; | |
560 | pftsubchunks[i1] = temp_buf; | |
561 | pftsubchunks[i2].substr_of(U_buf[node_xy], z*sub_chunksize, sub_chunksize); | |
562 | pftsubchunks[i3] = known_subchunks[i3]; | |
563 | for (int i=0; i<3; i++) { | |
564 | pftsubchunks[i].rebuild_aligned(SIMD_ALIGN); | |
565 | } | |
566 | pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks); | |
567 | } else { | |
568 | ceph_assert(helper_data.count(node_sw) > 0); | |
569 | ceph_assert(repair_plane_to_ind.count(z) > 0); | |
570 | if (z_vec[y] != x){ | |
571 | pft_erasures.insert(i2); | |
572 | ceph_assert(repair_plane_to_ind.count(z_sw) > 0); | |
573 | known_subchunks[i0].substr_of(helper_data[node_xy], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize); | |
574 | known_subchunks[i1].substr_of(helper_data[node_sw], repair_plane_to_ind[z_sw]*sub_chunksize, sub_chunksize); | |
575 | pftsubchunks[i0] = known_subchunks[i0]; | |
576 | pftsubchunks[i1] = known_subchunks[i1]; | |
577 | pftsubchunks[i2].substr_of(U_buf[node_xy], z*sub_chunksize, sub_chunksize); | |
578 | pftsubchunks[i3].substr_of(temp_buf, 0, sub_chunksize); | |
579 | for (int i=0; i<3; i++) { | |
580 | pftsubchunks[i].rebuild_aligned(SIMD_ALIGN); | |
581 | } | |
582 | pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks); | |
583 | } else { | |
584 | char* uncoupled_chunk = U_buf[node_xy].c_str(); | |
585 | char* coupled_chunk = helper_data[node_xy].c_str(); | |
586 | memcpy(&uncoupled_chunk[z*sub_chunksize], | |
587 | &coupled_chunk[repair_plane_to_ind[z]*sub_chunksize], | |
588 | sub_chunksize); | |
589 | } | |
590 | } | |
591 | } | |
592 | } // x | |
593 | } // y | |
594 | ceph_assert(erasures.size() <= (unsigned)m); | |
595 | decode_uncoupled(erasures, z, sub_chunksize); | |
596 | ||
597 | for (auto i : erasures) { | |
598 | int x = i % q; | |
599 | int y = i / q; | |
600 | int node_sw = y*q+z_vec[y]; | |
601 | int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y); | |
602 | set<int> pft_erasures; | |
603 | map<int, bufferlist> known_subchunks; | |
604 | map<int, bufferlist> pftsubchunks; | |
605 | int i0 = 0, i1 = 1, i2 = 2, i3 = 3; | |
606 | if (z_vec[y] > x) { | |
607 | i0 = 1; | |
608 | i1 = 0; | |
609 | i2 = 3; | |
610 | i3 = 2; | |
611 | } | |
612 | // make sure it is not an aloof node before you retrieve repaired_data | |
613 | if (aloof_nodes.count(i) == 0) { | |
614 | if (x == z_vec[y]) { // hole-dot pair (type 0) | |
615 | char* coupled_chunk = recovered_data[i].c_str(); | |
616 | char* uncoupled_chunk = U_buf[i].c_str(); | |
617 | memcpy(&coupled_chunk[z*sub_chunksize], | |
618 | &uncoupled_chunk[z*sub_chunksize], | |
619 | sub_chunksize); | |
620 | count_retrieved_sub_chunks++; | |
621 | } else { | |
622 | ceph_assert(y == lost_chunk / q); | |
623 | ceph_assert(node_sw == lost_chunk); | |
624 | ceph_assert(helper_data.count(i) > 0); | |
625 | pft_erasures.insert(i1); | |
626 | known_subchunks[i0].substr_of(helper_data[i], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize); | |
627 | known_subchunks[i2].substr_of(U_buf[i], z*sub_chunksize, sub_chunksize); | |
628 | ||
629 | pftsubchunks[i0] = known_subchunks[i0]; | |
630 | pftsubchunks[i1].substr_of(recovered_data[node_sw], z_sw*sub_chunksize, sub_chunksize); | |
631 | pftsubchunks[i2] = known_subchunks[i2]; | |
632 | pftsubchunks[i3] = temp_buf; | |
633 | for (int i=0; i<3; i++) { | |
634 | pftsubchunks[i].rebuild_aligned(SIMD_ALIGN); | |
635 | } | |
636 | pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks); | |
637 | } | |
638 | } | |
639 | } // recover all erasures | |
640 | } // planes of particular order | |
641 | } // order | |
642 | ||
643 | return 0; | |
644 | } | |
645 | ||
646 | ||
// Decode erased_chunks (given in the internal q*t layout) plane by plane.
//
// Planes are processed in increasing "intersection score" order -- the
// number of erased chunks that are dots of the plane -- because a plane's
// uncoupled values can only be completed after lower-order planes have
// recovered the coupled values they depend on.
int ErasureCodeClay::decode_layered(set<int> &erased_chunks,
				    map<int, bufferlist> *chunks)
{
  int num_erasures = erased_chunks.size();

  int size = (*chunks)[0].length();
  ceph_assert(size%sub_chunk_no == 0);
  int sc_size = size / sub_chunk_no;

  ceph_assert(num_erasures > 0);

  // The MDS layer always decodes exactly m erasures: pad the erasure set
  // with virtual (shortened) positions, whose content is known to be zero.
  for (int i = k+nu; (num_erasures < m) && (i < q*t); i++) {
    if ([[maybe_unused]] auto [it, added] = erased_chunks.emplace(i); added) {
      num_erasures++;
      (void)it; // silence -Wunused-variable
    }
  }
  ceph_assert(num_erasures == m);

  int max_iscore = get_max_iscore(erased_chunks);
  int order[sub_chunk_no];
  int z_vec[t];
  // lazily allocate the uncoupled-value buffers, one full chunk per node
  for (int i = 0; i < q*t; i++) {
    if (U_buf[i].length() == 0) {
      bufferptr buf(buffer::create_aligned(size, SIMD_ALIGN));
      buf.zero();
      U_buf[i].push_back(std::move(buf));
    }
  }

  set_planes_sequential_decoding_order(order, erased_chunks);

  for (int iscore = 0; iscore <= max_iscore; iscore++) {
    // first pass: recover the uncoupled values of all planes at this score
    for (int z = 0; z < sub_chunk_no; z++) {
      if (order[z] == iscore) {
	decode_erasures(erased_chunks, z, chunks, sc_size);
      }
    }

    // second pass: turn uncoupled values back into erased coupled chunks
    for (int z = 0; z < sub_chunk_no; z++) {
      if (order[z] == iscore) {
	get_plane_vector(z, z_vec);
	for (auto node_xy : erased_chunks) {
	  int x = node_xy % q;
	  int y = node_xy / q;
	  int node_sw = y*q+z_vec[y];  // sewing partner in the same row
	  if (z_vec[y] != x) {
	    if (erased_chunks.count(node_sw) == 0) {
	      // partner survived: type-1 recovery via the pairwise transform
	      recover_type1_erasure(chunks, x, y, z, z_vec, sc_size);
	    } else if (z_vec[y] < x){
	      // both members of the couple erased: recover the pair once,
	      // from the side with the larger column index
	      ceph_assert(erased_chunks.count(node_sw) > 0);
	      ceph_assert(z_vec[y] != x);
	      get_coupled_from_uncoupled(chunks, x, y, z, z_vec, sc_size);
	    }
	  } else {
	    // dot in this plane: coupled value equals the uncoupled one
	    char* C = (*chunks)[node_xy].c_str();
	    char* U = U_buf[node_xy].c_str();
	    memcpy(&C[z*sc_size], &U[z*sc_size], sc_size);
	  }
	}
      }
    } // plane
  } // iscore, order

  return 0;
}
713 | ||
// Fill in the uncoupled values of plane z at every surviving position,
// then MDS-decode the erased uncoupled values of that plane.
//
// For a surviving node (x,y) in plane z:
//  - z_vec[y] <  x : compute U (for this node and its partner) from the
//    coupled pair via the pairwise transform.
//  - z_vec[y] == x : the node is a "dot" in this plane, so U equals the
//    coupled value and a plain copy suffices.
//  - z_vec[y] >  x : U was already produced when the partner was visited,
//    unless the partner itself is erased -- then apply the transform from
//    this side.
int ErasureCodeClay::decode_erasures(const set<int>& erased_chunks, int z,
				     map<int, bufferlist>* chunks, int sc_size)
{
  int z_vec[t];

  get_plane_vector(z, z_vec);

  for (int x = 0; x < q; x++) {
    for (int y = 0; y < t; y++) {
      int node_xy = q*y+x;
      int node_sw = q*y+z_vec[y];  // sewing partner in row y
      if (erased_chunks.count(node_xy) == 0) {
	if (z_vec[y] < x) {
	  get_uncoupled_from_coupled(chunks, x, y, z, z_vec, sc_size);
	} else if (z_vec[y] == x) {
	  // dot: copy the coupled sub-chunk straight into U
	  char* uncoupled_chunk = U_buf[node_xy].c_str();
	  char* coupled_chunk = (*chunks)[node_xy].c_str();
	  memcpy(&uncoupled_chunk[z*sc_size], &coupled_chunk[z*sc_size], sc_size);
	} else {
	  if (erased_chunks.count(node_sw) > 0) {
	    get_uncoupled_from_coupled(chunks, x, y, z, z_vec, sc_size);
	  }
	}
      }
    }
  }
  return decode_uncoupled(erased_chunks, z, sc_size);
}
742 | ||
743 | int ErasureCodeClay::decode_uncoupled(const set<int>& erased_chunks, int z, int sc_size) | |
744 | { | |
745 | map<int, bufferlist> known_subchunks; | |
746 | map<int, bufferlist> all_subchunks; | |
747 | ||
748 | for (int i = 0; i < q*t; i++) { | |
749 | if (erased_chunks.count(i) == 0) { | |
750 | known_subchunks[i].substr_of(U_buf[i], z*sc_size, sc_size); | |
751 | all_subchunks[i] = known_subchunks[i]; | |
752 | } else { | |
753 | all_subchunks[i].substr_of(U_buf[i], z*sc_size, sc_size); | |
754 | } | |
755 | all_subchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN); | |
756 | assert(all_subchunks[i].is_contiguous()); | |
757 | } | |
758 | ||
759 | mds.erasure_code->decode_chunks(erased_chunks, known_subchunks, &all_subchunks); | |
760 | return 0; | |
761 | } | |
762 | ||
763 | void ErasureCodeClay::set_planes_sequential_decoding_order(int* order, set<int>& erasures) { | |
764 | int z_vec[t]; | |
765 | for (int z = 0; z < sub_chunk_no; z++) { | |
766 | get_plane_vector(z,z_vec); | |
767 | order[z] = 0; | |
768 | for (auto i : erasures) { | |
769 | if (i % q == z_vec[i / q]) { | |
770 | order[z] = order[z] + 1; | |
771 | } | |
772 | } | |
773 | } | |
774 | } | |
775 | ||
// Recover the coupled sub-chunk at erased position (x,y) in plane z, given
// that its sewing partner node_sw is NOT erased: decode the 2x2 pairwise
// transform with the partner's coupled value and this node's uncoupled
// value as the known inputs.
//
// The i0..i3 permutation orients the transform depending on which member
// of the couple has the smaller column index.
void ErasureCodeClay::recover_type1_erasure(map<int, bufferlist>* chunks,
					    int x, int y, int z,
					    int* z_vec, int sc_size)
{
  set<int> erased_chunks;

  int node_xy = y*q+x;
  int node_sw = y*q+z_vec[y];                          // sewing partner in row y
  int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);    // partner's plane index

  map<int, bufferlist> known_subchunks;
  map<int, bufferlist> pftsubchunks;
  bufferptr ptr(buffer::create_aligned(sc_size, SIMD_ALIGN));
  ptr.zero();

  int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
  if (z_vec[y] > x) {
    i0 = 1;
    i1 = 0;
    i2 = 3;
    i3 = 2;
  }

  erased_chunks.insert(i0);
  // i0: this node's coupled value (the unknown to solve for)
  pftsubchunks[i0].substr_of((*chunks)[node_xy], z * sc_size, sc_size);
  // i1: partner's coupled value; i2: this node's uncoupled value
  known_subchunks[i1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size);
  known_subchunks[i2].substr_of(U_buf[node_xy], z * sc_size, sc_size);
  pftsubchunks[i1] = known_subchunks[i1];
  pftsubchunks[i2] = known_subchunks[i2];
  pftsubchunks[i3].push_back(ptr);  // scratch; slot i3 is not consumed

  for (int i=0; i<3; i++) {
    pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
  }

  pft.erasure_code->decode_chunks(erased_chunks, known_subchunks, &pftsubchunks);
}
813 | ||
814 | void ErasureCodeClay::get_coupled_from_uncoupled(map<int, bufferlist>* chunks, | |
815 | int x, int y, int z, | |
816 | int* z_vec, int sc_size) | |
817 | { | |
818 | set<int> erased_chunks = {0, 1}; | |
819 | ||
820 | int node_xy = y*q+x; | |
821 | int node_sw = y*q+z_vec[y]; | |
822 | int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y); | |
823 | ||
824 | ceph_assert(z_vec[y] < x); | |
825 | map<int, bufferlist> uncoupled_subchunks; | |
826 | uncoupled_subchunks[2].substr_of(U_buf[node_xy], z * sc_size, sc_size); | |
827 | uncoupled_subchunks[3].substr_of(U_buf[node_sw], z_sw * sc_size, sc_size); | |
828 | ||
829 | map<int, bufferlist> pftsubchunks; | |
830 | pftsubchunks[0].substr_of((*chunks)[node_xy], z * sc_size, sc_size); | |
831 | pftsubchunks[1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size); | |
832 | pftsubchunks[2] = uncoupled_subchunks[2]; | |
833 | pftsubchunks[3] = uncoupled_subchunks[3]; | |
834 | ||
835 | for (int i=0; i<3; i++) { | |
836 | pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN); | |
837 | } | |
838 | pft.erasure_code->decode_chunks(erased_chunks, uncoupled_subchunks, &pftsubchunks); | |
839 | } | |
840 | ||
// Compute the uncoupled pair for position (x,y) in plane z and its sewing
// partner from the coupled values: slots 2/3 (uncoupled) are decoded from
// slots 0/1 (coupled) of the pairwise transform.
//
// The i0..i3 permutation orders the pair so the member with the smaller
// column index occupies the first slot of each half.
void ErasureCodeClay::get_uncoupled_from_coupled(map<int, bufferlist>* chunks,
						 int x, int y, int z,
						 int* z_vec, int sc_size)
{
  set<int> erased_chunks = {2, 3};

  int node_xy = y*q+x;
  int node_sw = y*q+z_vec[y];                          // sewing partner
  int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);    // partner's plane

  int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
  if (z_vec[y] > x) {
    i0 = 1;
    i1 = 0;
    i2 = 3;
    i3 = 2;
  }
  map<int, bufferlist> coupled_subchunks;
  coupled_subchunks[i0].substr_of((*chunks)[node_xy], z * sc_size, sc_size);
  coupled_subchunks[i1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size);

  map<int, bufferlist> pftsubchunks;
  // {i0, i1} is always {0, 1}, so literal 0/1 indexing is equivalent here
  pftsubchunks[0] = coupled_subchunks[0];
  pftsubchunks[1] = coupled_subchunks[1];
  pftsubchunks[i2].substr_of(U_buf[node_xy], z * sc_size, sc_size);
  pftsubchunks[i3].substr_of(U_buf[node_sw], z_sw * sc_size, sc_size);
  for (int i=0; i<3; i++) {
    pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
  }
  pft.erasure_code->decode_chunks(erased_chunks, coupled_subchunks, &pftsubchunks);
}
872 | ||
873 | int ErasureCodeClay::get_max_iscore(set<int>& erased_chunks) | |
874 | { | |
875 | int weight_vec[t]; | |
876 | int iscore = 0; | |
877 | memset(weight_vec, 0, sizeof(int)*t); | |
878 | ||
879 | for (auto i : erased_chunks) { | |
880 | if (weight_vec[i / q] == 0) { | |
881 | weight_vec[i / q] = 1; | |
882 | iscore++; | |
883 | } | |
884 | } | |
885 | return iscore; | |
886 | } | |
887 | ||
888 | void ErasureCodeClay::get_plane_vector(int z, int* z_vec) | |
889 | { | |
890 | for (int i = 0; i < t; i++) { | |
891 | z_vec[t-1-i] = z % q; | |
892 | z = (z - z_vec[t-1-i]) / q; | |
893 | } | |
894 | } |