1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2014 FUJITSU LIMITED
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
10 * Author: Takanori Nakao <nakao.takanori@jp.fujitsu.com>
11 * Author: Takeshi Miyamae <miyamae.takeshi@jp.fujitsu.com>
12 * Author: Loic Dachary <loic@dachary.org>
14 * This library is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU Lesser General Public
16 * License as published by the Free Software Foundation; either
17 * version 2.1 of the License, or (at your option) any later version.
28 #include "common/debug.h"
29 #include "ErasureCodeShec.h"
30 #include "crush/CrushWrapper.h"
31 #include "osd/osd_types.h"
33 #include "jerasure/include/jerasure.h"
34 #include "jerasure/include/galois.h"
36 extern int calc_determinant(int *matrix
, int dim
);
37 extern int* reed_sol_vandermonde_coding_matrix(int k
, int m
, int w
);
40 #define dout_context g_ceph_context
41 #define dout_subsys ceph_subsys_osd
43 #define dout_prefix _prefix(_dout)
45 static ostream
& _prefix(std::ostream
* _dout
)
47 return *_dout
<< "ErasureCodeShec: ";
50 int ErasureCodeShec::create_ruleset(const string
&name
,
54 int ruleid
= crush
.add_simple_rule(
55 name
, ruleset_root
, ruleset_failure_domain
,
56 "indep", pg_pool_t::TYPE_ERASURE
, ss
);
60 crush
.set_rule_mask_max_size(ruleid
, get_chunk_count());
61 return crush
.get_rule_mask_ruleset(ruleid
);
65 int ErasureCodeShec::init(ErasureCodeProfile
&profile
,
69 err
|= ErasureCode::to_string("ruleset-root", profile
,
71 DEFAULT_RULESET_ROOT
, ss
);
72 err
|= ErasureCode::to_string("ruleset-failure-domain", profile
,
73 &ruleset_failure_domain
,
74 DEFAULT_RULESET_FAILURE_DOMAIN
, ss
);
75 err
|= parse(profile
);
79 ErasureCode::init(profile
, ss
);
83 unsigned int ErasureCodeShec::get_chunk_size(unsigned int object_size
) const
85 unsigned alignment
= get_alignment();
86 unsigned tail
= object_size
% alignment
;
87 unsigned padded_length
= object_size
+ ( tail
? ( alignment
- tail
) : 0 );
89 assert(padded_length
% k
== 0);
90 return padded_length
/ k
;
93 int ErasureCodeShec::minimum_to_decode(const set
<int> &want_to_read
,
94 const set
<int> &available_chunks
,
95 set
<int> *minimum_chunks
)
97 if (!minimum_chunks
) return -EINVAL
;
99 for (set
<int>::iterator it
= available_chunks
.begin(); it
!= available_chunks
.end(); ++it
){
100 if (*it
< 0 || k
+m
<= *it
) return -EINVAL
;
103 for (set
<int>::iterator it
= want_to_read
.begin(); it
!= want_to_read
.end(); ++it
){
104 if (*it
< 0 || k
+m
<= *it
) return -EINVAL
;
111 memset(want
, 0, sizeof(want
));
112 memset(avails
, 0, sizeof(avails
));
113 memset(minimum
, 0, sizeof(minimum
));
114 (*minimum_chunks
).clear();
116 for (set
<int>::const_iterator i
= want_to_read
.begin();
117 i
!= want_to_read
.end();
122 for (set
<int>::const_iterator i
= available_chunks
.begin();
123 i
!= available_chunks
.end();
129 int decoding_matrix
[k
*k
];
132 memset(decoding_matrix
, 0, sizeof(decoding_matrix
));
133 memset(dm_row
, 0, sizeof(dm_row
));
134 memset(dm_column
, 0, sizeof(dm_column
));
135 if (shec_make_decoding_matrix(true, want
, avails
, decoding_matrix
, dm_row
, dm_column
, minimum
) < 0) {
140 for (int i
= 0; i
< k
+ m
; i
++) {
141 if (minimum
[i
] == 1) minimum_chunks
->insert(i
);
147 int ErasureCodeShec::minimum_to_decode_with_cost(const set
<int> &want_to_read
,
148 const map
<int, int> &available
,
149 set
<int> *minimum_chunks
)
151 set
<int> available_chunks
;
153 for (map
<int, int>::const_iterator i
= available
.begin();
154 i
!= available
.end();
156 available_chunks
.insert(i
->first
);
158 return minimum_to_decode(want_to_read
, available_chunks
, minimum_chunks
);
161 int ErasureCodeShec::encode(const set
<int> &want_to_encode
,
162 const bufferlist
&in
,
163 map
<int, bufferlist
> *encoded
)
165 unsigned int k
= get_data_chunk_count();
166 unsigned int m
= get_chunk_count() - k
;
169 if (!encoded
|| !encoded
->empty()){
173 int err
= encode_prepare(in
, *encoded
);
176 encode_chunks(want_to_encode
, encoded
);
177 for (unsigned int i
= 0; i
< k
+ m
; i
++) {
178 if (want_to_encode
.count(i
) == 0)
184 int ErasureCodeShec::encode_chunks(const set
<int> &want_to_encode
,
185 map
<int, bufferlist
> *encoded
)
188 for (int i
= 0; i
< k
+ m
; i
++){
189 chunks
[i
] = (*encoded
)[i
].c_str();
191 shec_encode(&chunks
[0], &chunks
[k
], (*encoded
)[0].length());
195 int ErasureCodeShec::decode(const set
<int> &want_to_read
,
196 const map
<int, bufferlist
> &chunks
,
197 map
<int, bufferlist
> *decoded
)
201 if (!decoded
|| !decoded
->empty()){
205 have
.reserve(chunks
.size());
206 for (map
<int, bufferlist
>::const_iterator i
= chunks
.begin();
209 have
.push_back(i
->first
);
212 have
.begin(), have
.end(), want_to_read
.begin(), want_to_read
.end())) {
213 for (set
<int>::iterator i
= want_to_read
.begin();
214 i
!= want_to_read
.end();
216 (*decoded
)[*i
] = chunks
.find(*i
)->second
;
220 unsigned int k
= get_data_chunk_count();
221 unsigned int m
= get_chunk_count() - k
;
222 unsigned blocksize
= (*chunks
.begin()).second
.length();
223 for (unsigned int i
= 0; i
< k
+ m
; i
++) {
224 if (chunks
.find(i
) == chunks
.end()) {
225 bufferptr
ptr(buffer::create_aligned(blocksize
, SIMD_ALIGN
));
226 (*decoded
)[i
].push_front(ptr
);
228 (*decoded
)[i
] = chunks
.find(i
)->second
;
229 (*decoded
)[i
].rebuild_aligned(SIMD_ALIGN
);
232 return decode_chunks(want_to_read
, chunks
, decoded
);
235 int ErasureCodeShec::decode_chunks(const set
<int> &want_to_read
,
236 const map
<int, bufferlist
> &chunks
,
237 map
<int, bufferlist
> *decoded
)
239 unsigned blocksize
= (*chunks
.begin()).second
.length();
241 int erased_count
= 0;
246 for (int i
= 0; i
< k
+ m
; i
++) {
248 if (chunks
.find(i
) == chunks
.end()) {
249 if (want_to_read
.count(i
) > 0) {
258 data
[i
] = (*decoded
)[i
].c_str();
260 coding
[i
- k
] = (*decoded
)[i
].c_str();
263 if (erased_count
> 0) {
264 return shec_decode(erased
, avails
, data
, coding
, blocksize
);
271 // ErasureCodeShecReedSolomonVandermonde
274 void ErasureCodeShecReedSolomonVandermonde::shec_encode(char **data
,
278 jerasure_matrix_encode(k
, m
, w
, matrix
, data
, coding
, blocksize
);
281 int ErasureCodeShecReedSolomonVandermonde::shec_decode(int *erased
,
287 return shec_matrix_decode(erased
, avails
, data
, coding
, blocksize
);
290 unsigned ErasureCodeShecReedSolomonVandermonde::get_alignment() const
292 return k
*w
*sizeof(int);
295 int ErasureCodeShecReedSolomonVandermonde::parse(const ErasureCodeProfile
&profile
)
299 if (profile
.find("k") == profile
.end() &&
300 profile
.find("m") == profile
.end() &&
301 profile
.find("c") == profile
.end()){
302 dout(10) << "(k, m, c) default to " << "(" << DEFAULT_K
303 << ", " << DEFAULT_M
<< ", " << DEFAULT_C
<< ")" << dendl
;
304 k
= DEFAULT_K
; m
= DEFAULT_M
; c
= DEFAULT_C
;
305 } else if (profile
.find("k") == profile
.end() ||
306 profile
.find("m") == profile
.end() ||
307 profile
.find("c") == profile
.end()){
308 dout(10) << "(k, m, c) must be chosen" << dendl
;
311 std::string err_k
, err_m
, err_c
, value_k
, value_m
, value_c
;
312 value_k
= profile
.find("k")->second
;
313 value_m
= profile
.find("m")->second
;
314 value_c
= profile
.find("c")->second
;
315 k
= strict_strtol(value_k
.c_str(), 10, &err_k
);
316 m
= strict_strtol(value_m
.c_str(), 10, &err_m
);
317 c
= strict_strtol(value_c
.c_str(), 10, &err_c
);
319 if (!err_k
.empty() || !err_m
.empty() || !err_c
.empty()){
321 derr
<< "could not convert k=" << value_k
<< "to int" << dendl
;
322 } else if (!err_m
.empty()){
323 derr
<< "could not convert m=" << value_m
<< "to int" << dendl
;
324 } else if (!err_c
.empty()){
325 derr
<< "could not convert c=" << value_c
<< "to int" << dendl
;
330 << " must be a positive number" << dendl
;
334 << " must be a positive number" << dendl
;
338 << " must be a positive number" << dendl
;
342 << " must be less than or equal to m=" << m
<< dendl
;
346 << " must be less than or equal to 12" << dendl
;
348 } else if (k
+m
> 20){
349 derr
<< "k+m=" << k
+m
350 << " must be less than or equal to 20" << dendl
;
354 << " must be less than or equal to k=" << k
<< dendl
;
360 derr
<< "(k, m, c)=(" << k
<< ", " << m
<< ", " << c
361 << ") is not a valid parameter." << dendl
;
365 dout(10) << "(k, m, c) set to " << "(" << k
<< ", " << m
<< ", "
369 if (profile
.find("w") == profile
.end()){
370 dout(10) << "w default to " << DEFAULT_W
<< dendl
;
373 std::string err_w
, value_w
;
374 value_w
= profile
.find("w")->second
;
375 w
= strict_strtol(value_w
.c_str(), 10, &err_w
);
378 derr
<< "could not convert w=" << value_w
<< "to int" << dendl
;
379 dout(10) << "w default to " << DEFAULT_W
<< dendl
;
382 } else if (w
!= 8 && w
!= 16 && w
!= 32) {
384 << " must be one of {8, 16, 32}" << dendl
;
385 dout(10) << "w default to " << DEFAULT_W
<< dendl
;
389 dout(10) << "w set to " << w
<< dendl
;
395 void ErasureCodeShecReedSolomonVandermonde::prepare()
397 // setup shared encoding table
399 tcache
.getEncodingTable(technique
, k
, m
, c
, w
);
402 dout(10) << "[ cache tables ] creating coeff for k=" <<
403 k
<< " m=" << m
<< " c=" << c
<< " w=" << w
<< dendl
;
405 matrix
= shec_reedsolomon_coding_matrix(technique
);
407 // either our new created table is stored or if it has been
408 // created in the meanwhile the locally allocated table will be
409 // freed by setEncodingTable
410 matrix
= tcache
.setEncodingTable(technique
, k
, m
, c
, w
, matrix
);
412 dout(10) << "matrix = " << dendl
;
413 for (int i
=0; i
<m
; i
++) {
415 for (int j
=0; j
<k
; j
++) {
416 if (matrix
[i
*k
+j
] > 0) {
423 dout(10) << mat
<< dendl
;
426 matrix
= *p_enc_table
;
429 dout(10) << " [ technique ] = " <<
430 ((technique
== MULTIPLE
) ? "multiple" : "single") << dendl
;
432 assert((technique
== SINGLE
) || (technique
== MULTIPLE
));
437 // Mearged from shec.cc.
439 double ErasureCodeShec::shec_calc_recovery_efficiency1(int k
, int m1
, int m2
, int c1
, int c2
){
442 int i
, rr
, cc
, start
, end
;
445 if (m1
< c1
|| m2
< c2
) return -1;
446 if ((m1
== 0 && c1
!= 0) || (m2
== 0 && c2
!= 0)) return -1;
448 for (i
=0; i
<k
; i
++) r_eff_k
[i
] = 100000000;
451 for (rr
=0; rr
<m1
; rr
++){
452 start
= ((rr
*k
)/m1
) % k
;
453 end
= (((rr
+c1
)*k
)/m1
) % k
;
454 for (cc
=start
, first_flag
=1; first_flag
|| cc
!=end
; cc
=(cc
+1)%k
){
456 r_eff_k
[cc
] = std::min(r_eff_k
[cc
], ((rr
+c1
)*k
)/m1
- (rr
*k
)/m1
);
458 r_e1
+= ((rr
+c1
)*k
)/m1
- (rr
*k
)/m1
;
461 for (rr
=0; rr
<m2
; rr
++){
462 start
= ((rr
*k
)/m2
) % k
;
463 end
= (((rr
+c2
)*k
)/m2
) % k
;
464 for (cc
=start
, first_flag
=1; first_flag
|| cc
!=end
; cc
=(cc
+1)%k
){
466 r_eff_k
[cc
] = std::min(r_eff_k
[cc
], ((rr
+c2
)*k
)/m2
- (rr
*k
)/m2
);
468 r_e1
+= ((rr
+c2
)*k
)/m2
- (rr
*k
)/m2
;
480 int* ErasureCodeShec::shec_reedsolomon_coding_matrix(int is_single
)
483 int rr
, cc
, start
, end
;
486 if (w
!= 8 && w
!= 16 && w
!= 32) return NULL
;
489 int c1_best
= -1, m1_best
= -1;
490 double min_r_e1
= 100.0;
492 // create all multiple shec pattern and choose best.
494 for (c1
=0; c1
<= c
/2; c1
++){
495 for (m1
=0; m1
<= m
; m1
++){
499 if (m1
< c1
|| m2
< c2
) continue;
500 if ((m1
== 0 && c1
!= 0) || (m2
== 0 && c2
!= 0)) continue;
501 if ((m1
!= 0 && c1
== 0) || (m2
!= 0 && c2
== 0)) continue;
507 r_e1
= shec_calc_recovery_efficiency1(k
, m1
, m2
, c1
, c2
);
508 if (min_r_e1
- r_e1
> std::numeric_limits
<double>::epsilon() &&
529 matrix
= reed_sol_vandermonde_coding_matrix(k
, m
, w
);
531 for (rr
=0; rr
<m1
; rr
++){
532 end
= ((rr
*k
)/m1
) % k
;
533 start
= (((rr
+c1
)*k
)/m1
) % k
;
534 for (cc
=start
; cc
!=end
; cc
=(cc
+1)%k
){
535 matrix
[cc
+ rr
*k
] = 0;
539 for (rr
=0; rr
<m2
; rr
++){
540 end
= ((rr
*k
)/m2
) % k
;
541 start
= (((rr
+c2
)*k
)/m2
) % k
;
542 for (cc
=start
; cc
!=end
; cc
=(cc
+1)%k
){
543 matrix
[cc
+ (rr
+m1
)*k
] = 0;
550 int ErasureCodeShec::shec_make_decoding_matrix(bool prepare
, int *want_
, int *avails
,
551 int *decoding_matrix
, int *dm_row
, int *dm_column
,
554 int mindup
= k
+1, minp
= k
+1;
557 memset(want
, 0, sizeof(want
));
559 for (int i
= 0; i
< k
+ m
; ++i
) {
563 for (int i
= 0; i
< m
; ++i
) {
564 if (want
[i
+ k
] && !avails
[i
+ k
]) {
565 for (int j
=0; j
< k
; ++j
) {
566 if (matrix
[i
* k
+ j
] > 0) {
573 if (tcache
.getDecodingTableFromCache(decoding_matrix
,
574 dm_row
, dm_column
, minimum
,
581 for (unsigned long long pp
= 0; pp
< (1ull << m
); ++pp
) {
583 // select parity chunks
586 for (int i
=0; i
< m
; ++i
) {
587 if (pp
& (1ull << i
)) {
595 // Are selected parity chunks avail?
597 for (int i
= 0; i
< ek
&& ok
; i
++) {
598 if (!avails
[k
+p
[i
]]) {
610 for (int i
= 0; i
< k
+ m
; i
++) {
613 for (int i
= 0; i
< k
; i
++) {
617 for (int i
=0; i
< k
; i
++) {
618 if (want
[i
] && !avails
[i
]) {
623 // Parity chunks which are used to recovery erased data chunks, are added to tmprow.
624 for (int i
= 0; i
< ek
; i
++) {
625 tmprow
[k
+ p
[i
]] = 1;
626 for (int j
= 0; j
< k
; j
++) {
627 int element
= matrix
[(p
[i
]) * k
+ j
];
631 if (element
!= 0 && avails
[j
] == 1) {
637 int dup_row
= 0, dup_column
= 0, dup
= 0;
638 for (int i
= 0; i
< k
+ m
; i
++) {
644 for (int i
= 0; i
< k
; i
++) {
650 if (dup_row
!= dup_column
) {
656 for (int i
= 0; i
< k
; i
++) {
659 for (int i
= 0; i
< k
; i
++) {
665 // minimum is updated.
667 int tmpmat
[dup
* dup
];
669 for (int i
= 0, row
= 0; i
< k
+ m
; i
++) {
671 for (int j
= 0, column
= 0; j
< k
; j
++) {
674 tmpmat
[row
* dup
+ column
] = (i
== j
? 1 : 0);
676 tmpmat
[row
* dup
+ column
] = matrix
[(i
- k
) * k
+ j
];
685 int det
= calc_determinant(tmpmat
, dup
);
690 for (int i
= 0; i
< k
; i
++) {
693 for (int i
= 0; i
< k
; i
++) {
698 for (int i
=0; i
< k
+ m
; i
++) {
700 dm_row
[row_id
++] = i
;
703 for (int i
=0; i
< k
; i
++) {
705 dm_column
[column_id
++] = i
;
715 fprintf(stderr
, "shec_make_decoding_matrix(): can't find recover matrix.\n");
719 for (int i
= 0; i
< k
+ m
; i
++) {
723 for (int i
=0; i
< k
&& dm_row
[i
] != -1; i
++) {
724 minimum
[dm_row
[i
]] = 1;
727 for (int i
= 0; i
< k
; ++i
) {
728 if (want
[i
] && avails
[i
]) {
733 for (int i
= 0; i
< m
; ++i
) {
734 if (want
[k
+ i
] && avails
[k
+ i
] && !minimum
[k
+ i
]) {
735 for (int j
= 0; j
< k
; ++j
) {
736 if (matrix
[i
* k
+ j
] > 0 && !want
[j
]) {
748 int tmpmat
[mindup
* mindup
];
749 for (int i
=0; i
< mindup
; i
++) {
750 for (int j
=0; j
< mindup
; j
++) {
752 tmpmat
[i
* mindup
+ j
] = (dm_row
[i
] == dm_column
[j
] ? 1 : 0);
754 tmpmat
[i
* mindup
+ j
] = matrix
[(dm_row
[i
] - k
) * k
+ dm_column
[j
]];
758 for (int j
= 0; j
< mindup
; j
++) {
759 if (dm_row
[i
] == dm_column
[j
]) {
764 dm_row
[i
] -= (k
- mindup
);
772 int ret
= jerasure_invert_matrix(tmpmat
, decoding_matrix
, mindup
, w
);
774 tcache
.putDecodingTableToCache(decoding_matrix
, dm_row
, dm_column
, minimum
, technique
,
775 k
, m
, c
, w
, want
, avails
);
780 int ErasureCodeShec::shec_matrix_decode(int *want
, int *avails
, char **data_ptrs
,
781 char **coding_ptrs
, int size
)
783 int decoding_matrix
[k
*k
];
784 int dm_row
[k
], dm_column
[k
];
787 memset(decoding_matrix
, 0, sizeof(decoding_matrix
));
788 memset(dm_row
, -1, sizeof(dm_row
));
789 memset(dm_column
, -1, sizeof(dm_column
));
790 memset(minimum
, -1, sizeof(minimum
));
792 if (w
!= 8 && w
!= 16 && w
!= 32) return -1;
794 if (shec_make_decoding_matrix(false, want
, avails
, decoding_matrix
,
795 dm_row
, dm_column
, minimum
) < 0) {
799 // Get decoding matrix size
801 for (int i
= 0; i
< k
; i
++) {
802 if (dm_row
[i
] == -1) {
808 char *dm_data_ptrs
[dm_size
];
809 for (int i
= 0; i
< dm_size
; i
++) {
810 dm_data_ptrs
[i
] = data_ptrs
[dm_column
[i
]];
813 // Decode the data drives
814 for (int i
= 0; i
< dm_size
; i
++) {
815 if (!avails
[dm_column
[i
]]) {
816 jerasure_matrix_dotprod(dm_size
, w
, decoding_matrix
+ (i
* dm_size
),
817 dm_row
, i
, dm_data_ptrs
, coding_ptrs
, size
);
821 // Re-encode any erased coding devices
822 for (int i
= 0; i
< m
; i
++) {
823 if (want
[k
+i
] && !avails
[k
+i
]) {
824 jerasure_matrix_dotprod(k
, w
, matrix
+ (i
* k
), NULL
, i
+k
,
825 data_ptrs
, coding_ptrs
, size
);