]>
git.proxmox.com Git - ceph.git/blob - ceph/src/erasure-code/shec/ErasureCodeShec.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2014 FUJITSU LIMITED
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
10 * Author: Takanori Nakao <nakao.takanori@jp.fujitsu.com>
11 * Author: Takeshi Miyamae <miyamae.takeshi@jp.fujitsu.com>
12 * Author: Loic Dachary <loic@dachary.org>
14 * This library is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU Lesser General Public
16 * License as published by the Free Software Foundation; either
17 * version 2.1 of the License, or (at your option) any later version.
28 #include "common/debug.h"
29 #include "ErasureCodeShec.h"
31 #include "jerasure/include/jerasure.h"
32 #include "jerasure/include/galois.h"
34 extern int calc_determinant(int *matrix
, int dim
);
35 extern int* reed_sol_vandermonde_coding_matrix(int k
, int m
, int w
);
38 #define dout_context g_ceph_context
39 #define dout_subsys ceph_subsys_osd
41 #define dout_prefix _prefix(_dout)
43 static ostream
& _prefix(std::ostream
* _dout
)
45 return *_dout
<< "ErasureCodeShec: ";
48 int ErasureCodeShec::init(ErasureCodeProfile
&profile
,
52 err
|= parse(profile
);
56 return ErasureCode::init(profile
, ss
);
59 unsigned int ErasureCodeShec::get_chunk_size(unsigned int object_size
) const
61 unsigned alignment
= get_alignment();
62 unsigned tail
= object_size
% alignment
;
63 unsigned padded_length
= object_size
+ ( tail
? ( alignment
- tail
) : 0 );
65 ceph_assert(padded_length
% k
== 0);
66 return padded_length
/ k
;
69 int ErasureCodeShec::_minimum_to_decode(const set
<int> &want_to_read
,
70 const set
<int> &available_chunks
,
71 set
<int> *minimum_chunks
)
73 if (!minimum_chunks
) return -EINVAL
;
75 for (set
<int>::iterator it
= available_chunks
.begin(); it
!= available_chunks
.end(); ++it
){
76 if (*it
< 0 || k
+m
<= *it
) return -EINVAL
;
79 for (set
<int>::iterator it
= want_to_read
.begin(); it
!= want_to_read
.end(); ++it
){
80 if (*it
< 0 || k
+m
<= *it
) return -EINVAL
;
87 memset(want
, 0, sizeof(want
));
88 memset(avails
, 0, sizeof(avails
));
89 memset(minimum
, 0, sizeof(minimum
));
90 (*minimum_chunks
).clear();
92 for (set
<int>::const_iterator i
= want_to_read
.begin();
93 i
!= want_to_read
.end();
98 for (set
<int>::const_iterator i
= available_chunks
.begin();
99 i
!= available_chunks
.end();
105 int decoding_matrix
[k
*k
];
108 memset(decoding_matrix
, 0, sizeof(decoding_matrix
));
109 memset(dm_row
, 0, sizeof(dm_row
));
110 memset(dm_column
, 0, sizeof(dm_column
));
111 if (shec_make_decoding_matrix(true, want
, avails
, decoding_matrix
, dm_row
, dm_column
, minimum
) < 0) {
116 for (int i
= 0; i
< k
+ m
; i
++) {
117 if (minimum
[i
] == 1) minimum_chunks
->insert(i
);
123 int ErasureCodeShec::minimum_to_decode_with_cost(const set
<int> &want_to_read
,
124 const map
<int, int> &available
,
125 set
<int> *minimum_chunks
)
127 set
<int> available_chunks
;
129 for (map
<int, int>::const_iterator i
= available
.begin();
130 i
!= available
.end();
132 available_chunks
.insert(i
->first
);
134 return _minimum_to_decode(want_to_read
, available_chunks
, minimum_chunks
);
137 int ErasureCodeShec::encode(const set
<int> &want_to_encode
,
138 const bufferlist
&in
,
139 map
<int, bufferlist
> *encoded
)
141 unsigned int k
= get_data_chunk_count();
142 unsigned int m
= get_chunk_count() - k
;
145 if (!encoded
|| !encoded
->empty()){
149 int err
= encode_prepare(in
, *encoded
);
152 encode_chunks(want_to_encode
, encoded
);
153 for (unsigned int i
= 0; i
< k
+ m
; i
++) {
154 if (want_to_encode
.count(i
) == 0)
160 int ErasureCodeShec::encode_chunks(const set
<int> &want_to_encode
,
161 map
<int, bufferlist
> *encoded
)
164 for (int i
= 0; i
< k
+ m
; i
++){
165 chunks
[i
] = (*encoded
)[i
].c_str();
167 shec_encode(&chunks
[0], &chunks
[k
], (*encoded
)[0].length());
171 int ErasureCodeShec::_decode(const set
<int> &want_to_read
,
172 const map
<int, bufferlist
> &chunks
,
173 map
<int, bufferlist
> *decoded
)
177 if (!decoded
|| !decoded
->empty()){
181 have
.reserve(chunks
.size());
182 for (map
<int, bufferlist
>::const_iterator i
= chunks
.begin();
185 have
.push_back(i
->first
);
188 have
.begin(), have
.end(), want_to_read
.begin(), want_to_read
.end())) {
189 for (set
<int>::iterator i
= want_to_read
.begin();
190 i
!= want_to_read
.end();
192 (*decoded
)[*i
] = chunks
.find(*i
)->second
;
196 unsigned int k
= get_data_chunk_count();
197 unsigned int m
= get_chunk_count() - k
;
198 unsigned blocksize
= (*chunks
.begin()).second
.length();
199 for (unsigned int i
= 0; i
< k
+ m
; i
++) {
200 if (chunks
.find(i
) == chunks
.end()) {
202 bufferptr
ptr(buffer::create_aligned(blocksize
, SIMD_ALIGN
));
204 tmp
.claim_append((*decoded
)[i
]);
205 (*decoded
)[i
].swap(tmp
);
207 (*decoded
)[i
] = chunks
.find(i
)->second
;
208 (*decoded
)[i
].rebuild_aligned(SIMD_ALIGN
);
211 return decode_chunks(want_to_read
, chunks
, decoded
);
214 int ErasureCodeShec::decode_chunks(const set
<int> &want_to_read
,
215 const map
<int, bufferlist
> &chunks
,
216 map
<int, bufferlist
> *decoded
)
218 unsigned blocksize
= (*chunks
.begin()).second
.length();
220 int erased_count
= 0;
225 for (int i
= 0; i
< k
+ m
; i
++) {
227 if (chunks
.find(i
) == chunks
.end()) {
228 if (want_to_read
.count(i
) > 0) {
237 data
[i
] = (*decoded
)[i
].c_str();
239 coding
[i
- k
] = (*decoded
)[i
].c_str();
242 if (erased_count
> 0) {
243 return shec_decode(erased
, avails
, data
, coding
, blocksize
);
250 // ErasureCodeShecReedSolomonVandermonde
253 void ErasureCodeShecReedSolomonVandermonde::shec_encode(char **data
,
257 jerasure_matrix_encode(k
, m
, w
, matrix
, data
, coding
, blocksize
);
260 int ErasureCodeShecReedSolomonVandermonde::shec_decode(int *erased
,
266 return shec_matrix_decode(erased
, avails
, data
, coding
, blocksize
);
269 unsigned ErasureCodeShecReedSolomonVandermonde::get_alignment() const
271 return k
*w
*sizeof(int);
274 int ErasureCodeShecReedSolomonVandermonde::parse(const ErasureCodeProfile
&profile
)
278 if (profile
.find("k") == profile
.end() &&
279 profile
.find("m") == profile
.end() &&
280 profile
.find("c") == profile
.end()){
281 dout(10) << "(k, m, c) default to " << "(" << DEFAULT_K
282 << ", " << DEFAULT_M
<< ", " << DEFAULT_C
<< ")" << dendl
;
283 k
= DEFAULT_K
; m
= DEFAULT_M
; c
= DEFAULT_C
;
284 } else if (profile
.find("k") == profile
.end() ||
285 profile
.find("m") == profile
.end() ||
286 profile
.find("c") == profile
.end()){
287 dout(10) << "(k, m, c) must be chosen" << dendl
;
290 std::string err_k
, err_m
, err_c
, value_k
, value_m
, value_c
;
291 value_k
= profile
.find("k")->second
;
292 value_m
= profile
.find("m")->second
;
293 value_c
= profile
.find("c")->second
;
294 k
= strict_strtol(value_k
.c_str(), 10, &err_k
);
295 m
= strict_strtol(value_m
.c_str(), 10, &err_m
);
296 c
= strict_strtol(value_c
.c_str(), 10, &err_c
);
298 if (!err_k
.empty() || !err_m
.empty() || !err_c
.empty()){
300 derr
<< "could not convert k=" << value_k
<< "to int" << dendl
;
301 } else if (!err_m
.empty()){
302 derr
<< "could not convert m=" << value_m
<< "to int" << dendl
;
303 } else if (!err_c
.empty()){
304 derr
<< "could not convert c=" << value_c
<< "to int" << dendl
;
309 << " must be a positive number" << dendl
;
313 << " must be a positive number" << dendl
;
317 << " must be a positive number" << dendl
;
321 << " must be less than or equal to m=" << m
<< dendl
;
325 << " must be less than or equal to 12" << dendl
;
327 } else if (k
+m
> 20){
328 derr
<< "k+m=" << k
+m
329 << " must be less than or equal to 20" << dendl
;
333 << " must be less than or equal to k=" << k
<< dendl
;
339 derr
<< "(k, m, c)=(" << k
<< ", " << m
<< ", " << c
340 << ") is not a valid parameter." << dendl
;
344 dout(10) << "(k, m, c) set to " << "(" << k
<< ", " << m
<< ", "
348 if (profile
.find("w") == profile
.end()){
349 dout(10) << "w default to " << DEFAULT_W
<< dendl
;
352 std::string err_w
, value_w
;
353 value_w
= profile
.find("w")->second
;
354 w
= strict_strtol(value_w
.c_str(), 10, &err_w
);
357 derr
<< "could not convert w=" << value_w
<< "to int" << dendl
;
358 dout(10) << "w default to " << DEFAULT_W
<< dendl
;
361 } else if (w
!= 8 && w
!= 16 && w
!= 32) {
363 << " must be one of {8, 16, 32}" << dendl
;
364 dout(10) << "w default to " << DEFAULT_W
<< dendl
;
368 dout(10) << "w set to " << w
<< dendl
;
374 void ErasureCodeShecReedSolomonVandermonde::prepare()
376 // setup shared encoding table
378 tcache
.getEncodingTable(technique
, k
, m
, c
, w
);
381 dout(10) << "[ cache tables ] creating coeff for k=" <<
382 k
<< " m=" << m
<< " c=" << c
<< " w=" << w
<< dendl
;
384 matrix
= shec_reedsolomon_coding_matrix(technique
);
386 // either our new created table is stored or if it has been
387 // created in the meanwhile the locally allocated table will be
388 // freed by setEncodingTable
389 matrix
= tcache
.setEncodingTable(technique
, k
, m
, c
, w
, matrix
);
391 dout(10) << "matrix = " << dendl
;
392 for (int i
=0; i
<m
; i
++) {
394 for (int j
=0; j
<k
; j
++) {
395 if (matrix
[i
*k
+j
] > 0) {
402 dout(10) << mat
<< dendl
;
405 matrix
= *p_enc_table
;
408 dout(10) << " [ technique ] = " <<
409 ((technique
== MULTIPLE
) ? "multiple" : "single") << dendl
;
411 ceph_assert((technique
== SINGLE
) || (technique
== MULTIPLE
));
416 // Mearged from shec.cc.
418 double ErasureCodeShec::shec_calc_recovery_efficiency1(int k
, int m1
, int m2
, int c1
, int c2
){
421 int i
, rr
, cc
, start
, end
;
424 if (m1
< c1
|| m2
< c2
) return -1;
425 if ((m1
== 0 && c1
!= 0) || (m2
== 0 && c2
!= 0)) return -1;
427 for (i
=0; i
<k
; i
++) r_eff_k
[i
] = 100000000;
430 for (rr
=0; rr
<m1
; rr
++){
431 start
= ((rr
*k
)/m1
) % k
;
432 end
= (((rr
+c1
)*k
)/m1
) % k
;
433 for (cc
=start
, first_flag
=1; first_flag
|| cc
!=end
; cc
=(cc
+1)%k
){
435 r_eff_k
[cc
] = std::min(r_eff_k
[cc
], ((rr
+c1
)*k
)/m1
- (rr
*k
)/m1
);
437 r_e1
+= ((rr
+c1
)*k
)/m1
- (rr
*k
)/m1
;
440 for (rr
=0; rr
<m2
; rr
++){
441 start
= ((rr
*k
)/m2
) % k
;
442 end
= (((rr
+c2
)*k
)/m2
) % k
;
443 for (cc
=start
, first_flag
=1; first_flag
|| cc
!=end
; cc
=(cc
+1)%k
){
445 r_eff_k
[cc
] = std::min(r_eff_k
[cc
], ((rr
+c2
)*k
)/m2
- (rr
*k
)/m2
);
447 r_e1
+= ((rr
+c2
)*k
)/m2
- (rr
*k
)/m2
;
459 int* ErasureCodeShec::shec_reedsolomon_coding_matrix(int is_single
)
462 int rr
, cc
, start
, end
;
465 if (w
!= 8 && w
!= 16 && w
!= 32) return NULL
;
468 int c1_best
= -1, m1_best
= -1;
469 double min_r_e1
= 100.0;
471 // create all multiple shec pattern and choose best.
473 for (c1
=0; c1
<= c
/2; c1
++){
474 for (m1
=0; m1
<= m
; m1
++){
478 if (m1
< c1
|| m2
< c2
) continue;
479 if ((m1
== 0 && c1
!= 0) || (m2
== 0 && c2
!= 0)) continue;
480 if ((m1
!= 0 && c1
== 0) || (m2
!= 0 && c2
== 0)) continue;
486 r_e1
= shec_calc_recovery_efficiency1(k
, m1
, m2
, c1
, c2
);
487 if (min_r_e1
- r_e1
> std::numeric_limits
<double>::epsilon() &&
508 matrix
= reed_sol_vandermonde_coding_matrix(k
, m
, w
);
510 for (rr
=0; rr
<m1
; rr
++){
511 end
= ((rr
*k
)/m1
) % k
;
512 start
= (((rr
+c1
)*k
)/m1
) % k
;
513 for (cc
=start
; cc
!=end
; cc
=(cc
+1)%k
){
514 matrix
[cc
+ rr
*k
] = 0;
518 for (rr
=0; rr
<m2
; rr
++){
519 end
= ((rr
*k
)/m2
) % k
;
520 start
= (((rr
+c2
)*k
)/m2
) % k
;
521 for (cc
=start
; cc
!=end
; cc
=(cc
+1)%k
){
522 matrix
[cc
+ (rr
+m1
)*k
] = 0;
529 int ErasureCodeShec::shec_make_decoding_matrix(bool prepare
, int *want_
, int *avails
,
530 int *decoding_matrix
, int *dm_row
, int *dm_column
,
533 int mindup
= k
+1, minp
= k
+1;
536 memset(want
, 0, sizeof(want
));
538 for (int i
= 0; i
< k
+ m
; ++i
) {
542 for (int i
= 0; i
< m
; ++i
) {
543 if (want
[i
+ k
] && !avails
[i
+ k
]) {
544 for (int j
=0; j
< k
; ++j
) {
545 if (matrix
[i
* k
+ j
] > 0) {
552 if (tcache
.getDecodingTableFromCache(decoding_matrix
,
553 dm_row
, dm_column
, minimum
,
560 for (unsigned long long pp
= 0; pp
< (1ull << m
); ++pp
) {
562 // select parity chunks
565 for (int i
=0; i
< m
; ++i
) {
566 if (pp
& (1ull << i
)) {
574 // Are selected parity chunks avail?
576 for (int i
= 0; i
< ek
&& ok
; i
++) {
577 if (!avails
[k
+p
[i
]]) {
589 for (int i
= 0; i
< k
+ m
; i
++) {
592 for (int i
= 0; i
< k
; i
++) {
596 for (int i
=0; i
< k
; i
++) {
597 if (want
[i
] && !avails
[i
]) {
602 // Parity chunks which are used to recovery erased data chunks, are added to tmprow.
603 for (int i
= 0; i
< ek
; i
++) {
604 tmprow
[k
+ p
[i
]] = 1;
605 for (int j
= 0; j
< k
; j
++) {
606 int element
= matrix
[(p
[i
]) * k
+ j
];
610 if (element
!= 0 && avails
[j
] == 1) {
616 int dup_row
= 0, dup_column
= 0, dup
= 0;
617 for (int i
= 0; i
< k
+ m
; i
++) {
623 for (int i
= 0; i
< k
; i
++) {
629 if (dup_row
!= dup_column
) {
635 for (int i
= 0; i
< k
; i
++) {
638 for (int i
= 0; i
< k
; i
++) {
644 // minimum is updated.
646 int tmpmat
[dup
* dup
];
648 for (int i
= 0, row
= 0; i
< k
+ m
; i
++) {
650 for (int j
= 0, column
= 0; j
< k
; j
++) {
653 tmpmat
[row
* dup
+ column
] = (i
== j
? 1 : 0);
655 tmpmat
[row
* dup
+ column
] = matrix
[(i
- k
) * k
+ j
];
664 int det
= calc_determinant(tmpmat
, dup
);
669 for (int i
= 0; i
< k
; i
++) {
672 for (int i
= 0; i
< k
; i
++) {
677 for (int i
=0; i
< k
+ m
; i
++) {
679 dm_row
[row_id
++] = i
;
682 for (int i
=0; i
< k
; i
++) {
684 dm_column
[column_id
++] = i
;
694 fprintf(stderr
, "shec_make_decoding_matrix(): can't find recover matrix.\n");
698 for (int i
= 0; i
< k
+ m
; i
++) {
702 for (int i
=0; i
< k
&& dm_row
[i
] != -1; i
++) {
703 minimum
[dm_row
[i
]] = 1;
706 for (int i
= 0; i
< k
; ++i
) {
707 if (want
[i
] && avails
[i
]) {
712 for (int i
= 0; i
< m
; ++i
) {
713 if (want
[k
+ i
] && avails
[k
+ i
] && !minimum
[k
+ i
]) {
714 for (int j
= 0; j
< k
; ++j
) {
715 if (matrix
[i
* k
+ j
] > 0 && !want
[j
]) {
727 int tmpmat
[mindup
* mindup
];
728 for (int i
=0; i
< mindup
; i
++) {
729 for (int j
=0; j
< mindup
; j
++) {
731 tmpmat
[i
* mindup
+ j
] = (dm_row
[i
] == dm_column
[j
] ? 1 : 0);
733 tmpmat
[i
* mindup
+ j
] = matrix
[(dm_row
[i
] - k
) * k
+ dm_column
[j
]];
737 for (int j
= 0; j
< mindup
; j
++) {
738 if (dm_row
[i
] == dm_column
[j
]) {
743 dm_row
[i
] -= (k
- mindup
);
751 int ret
= jerasure_invert_matrix(tmpmat
, decoding_matrix
, mindup
, w
);
753 tcache
.putDecodingTableToCache(decoding_matrix
, dm_row
, dm_column
, minimum
, technique
,
754 k
, m
, c
, w
, want
, avails
);
759 int ErasureCodeShec::shec_matrix_decode(int *want
, int *avails
, char **data_ptrs
,
760 char **coding_ptrs
, int size
)
762 int decoding_matrix
[k
*k
];
763 int dm_row
[k
], dm_column
[k
];
766 memset(decoding_matrix
, 0, sizeof(decoding_matrix
));
767 memset(dm_row
, -1, sizeof(dm_row
));
768 memset(dm_column
, -1, sizeof(dm_column
));
769 memset(minimum
, -1, sizeof(minimum
));
771 if (w
!= 8 && w
!= 16 && w
!= 32) return -1;
773 if (shec_make_decoding_matrix(false, want
, avails
, decoding_matrix
,
774 dm_row
, dm_column
, minimum
) < 0) {
778 // Get decoding matrix size
780 for (int i
= 0; i
< k
; i
++) {
781 if (dm_row
[i
] == -1) {
787 char *dm_data_ptrs
[dm_size
];
788 for (int i
= 0; i
< dm_size
; i
++) {
789 dm_data_ptrs
[i
] = data_ptrs
[dm_column
[i
]];
792 // Decode the data drives
793 for (int i
= 0; i
< dm_size
; i
++) {
794 if (!avails
[dm_column
[i
]]) {
795 jerasure_matrix_dotprod(dm_size
, w
, decoding_matrix
+ (i
* dm_size
),
796 dm_row
, i
, dm_data_ptrs
, coding_ptrs
, size
);
800 // Re-encode any erased coding devices
801 for (int i
= 0; i
< m
; i
++) {
802 if (want
[k
+i
] && !avails
[k
+i
]) {
803 jerasure_matrix_dotprod(k
, w
, matrix
+ (i
* k
), NULL
, i
+k
,
804 data_ptrs
, coding_ptrs
, size
);