]>
git.proxmox.com Git - ceph.git/blob - ceph/src/erasure-code/shec/ErasureCodeShec.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2014 FUJITSU LIMITED
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
10 * Author: Takanori Nakao <nakao.takanori@jp.fujitsu.com>
11 * Author: Takeshi Miyamae <miyamae.takeshi@jp.fujitsu.com>
12 * Author: Loic Dachary <loic@dachary.org>
14 * This library is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU Lesser General Public
16 * License as published by the Free Software Foundation; either
17 * version 2.1 of the License, or (at your option) any later version.
26 #include "common/debug.h"
27 #include "ErasureCodeShec.h"
29 #include "jerasure/include/jerasure.h"
30 #include "jerasure/include/galois.h"
32 extern int calc_determinant(int *matrix
, int dim
);
33 extern int* reed_sol_vandermonde_coding_matrix(int k
, int m
, int w
);
36 #define dout_context g_ceph_context
37 #define dout_subsys ceph_subsys_osd
39 #define dout_prefix _prefix(_dout)
45 static ostream
& _prefix(std::ostream
* _dout
)
47 return *_dout
<< "ErasureCodeShec: ";
50 int ErasureCodeShec::init(ErasureCodeProfile
&profile
,
54 err
|= parse(profile
);
58 return ErasureCode::init(profile
, ss
);
61 unsigned int ErasureCodeShec::get_chunk_size(unsigned int object_size
) const
63 unsigned alignment
= get_alignment();
64 unsigned tail
= object_size
% alignment
;
65 unsigned padded_length
= object_size
+ ( tail
? ( alignment
- tail
) : 0 );
67 ceph_assert(padded_length
% k
== 0);
68 return padded_length
/ k
;
71 int ErasureCodeShec::_minimum_to_decode(const set
<int> &want_to_read
,
72 const set
<int> &available_chunks
,
73 set
<int> *minimum_chunks
)
75 if (!minimum_chunks
) return -EINVAL
;
77 for (set
<int>::iterator it
= available_chunks
.begin(); it
!= available_chunks
.end(); ++it
){
78 if (*it
< 0 || k
+m
<= *it
) return -EINVAL
;
81 for (set
<int>::iterator it
= want_to_read
.begin(); it
!= want_to_read
.end(); ++it
){
82 if (*it
< 0 || k
+m
<= *it
) return -EINVAL
;
89 memset(want
, 0, sizeof(want
));
90 memset(avails
, 0, sizeof(avails
));
91 memset(minimum
, 0, sizeof(minimum
));
92 (*minimum_chunks
).clear();
94 for (set
<int>::const_iterator i
= want_to_read
.begin();
95 i
!= want_to_read
.end();
100 for (set
<int>::const_iterator i
= available_chunks
.begin();
101 i
!= available_chunks
.end();
107 int decoding_matrix
[k
*k
];
110 memset(decoding_matrix
, 0, sizeof(decoding_matrix
));
111 memset(dm_row
, 0, sizeof(dm_row
));
112 memset(dm_column
, 0, sizeof(dm_column
));
113 if (shec_make_decoding_matrix(true, want
, avails
, decoding_matrix
, dm_row
, dm_column
, minimum
) < 0) {
118 for (int i
= 0; i
< k
+ m
; i
++) {
119 if (minimum
[i
] == 1) minimum_chunks
->insert(i
);
125 int ErasureCodeShec::minimum_to_decode_with_cost(const set
<int> &want_to_read
,
126 const map
<int, int> &available
,
127 set
<int> *minimum_chunks
)
129 set
<int> available_chunks
;
131 for (map
<int, int>::const_iterator i
= available
.begin();
132 i
!= available
.end();
134 available_chunks
.insert(i
->first
);
136 return _minimum_to_decode(want_to_read
, available_chunks
, minimum_chunks
);
139 int ErasureCodeShec::encode(const set
<int> &want_to_encode
,
140 const bufferlist
&in
,
141 map
<int, bufferlist
> *encoded
)
143 unsigned int k
= get_data_chunk_count();
144 unsigned int m
= get_chunk_count() - k
;
147 if (!encoded
|| !encoded
->empty()){
151 int err
= encode_prepare(in
, *encoded
);
154 encode_chunks(want_to_encode
, encoded
);
155 for (unsigned int i
= 0; i
< k
+ m
; i
++) {
156 if (want_to_encode
.count(i
) == 0)
162 int ErasureCodeShec::encode_chunks(const set
<int> &want_to_encode
,
163 map
<int, bufferlist
> *encoded
)
166 for (int i
= 0; i
< k
+ m
; i
++){
167 chunks
[i
] = (*encoded
)[i
].c_str();
169 shec_encode(&chunks
[0], &chunks
[k
], (*encoded
)[0].length());
173 int ErasureCodeShec::_decode(const set
<int> &want_to_read
,
174 const map
<int, bufferlist
> &chunks
,
175 map
<int, bufferlist
> *decoded
)
179 if (!decoded
|| !decoded
->empty()){
183 have
.reserve(chunks
.size());
184 for (map
<int, bufferlist
>::const_iterator i
= chunks
.begin();
187 have
.push_back(i
->first
);
190 have
.begin(), have
.end(), want_to_read
.begin(), want_to_read
.end())) {
191 for (set
<int>::iterator i
= want_to_read
.begin();
192 i
!= want_to_read
.end();
194 (*decoded
)[*i
] = chunks
.find(*i
)->second
;
198 unsigned int k
= get_data_chunk_count();
199 unsigned int m
= get_chunk_count() - k
;
200 unsigned blocksize
= (*chunks
.begin()).second
.length();
201 for (unsigned int i
= 0; i
< k
+ m
; i
++) {
202 if (chunks
.find(i
) == chunks
.end()) {
204 bufferptr
ptr(buffer::create_aligned(blocksize
, SIMD_ALIGN
));
206 tmp
.claim_append((*decoded
)[i
]);
207 (*decoded
)[i
].swap(tmp
);
209 (*decoded
)[i
] = chunks
.find(i
)->second
;
210 (*decoded
)[i
].rebuild_aligned(SIMD_ALIGN
);
213 return decode_chunks(want_to_read
, chunks
, decoded
);
216 int ErasureCodeShec::decode_chunks(const set
<int> &want_to_read
,
217 const map
<int, bufferlist
> &chunks
,
218 map
<int, bufferlist
> *decoded
)
220 unsigned blocksize
= (*chunks
.begin()).second
.length();
222 int erased_count
= 0;
227 for (int i
= 0; i
< k
+ m
; i
++) {
229 if (chunks
.find(i
) == chunks
.end()) {
230 if (want_to_read
.count(i
) > 0) {
239 data
[i
] = (*decoded
)[i
].c_str();
241 coding
[i
- k
] = (*decoded
)[i
].c_str();
244 if (erased_count
> 0) {
245 return shec_decode(erased
, avails
, data
, coding
, blocksize
);
252 // ErasureCodeShecReedSolomonVandermonde
255 void ErasureCodeShecReedSolomonVandermonde::shec_encode(char **data
,
259 jerasure_matrix_encode(k
, m
, w
, matrix
, data
, coding
, blocksize
);
262 int ErasureCodeShecReedSolomonVandermonde::shec_decode(int *erased
,
268 return shec_matrix_decode(erased
, avails
, data
, coding
, blocksize
);
271 unsigned ErasureCodeShecReedSolomonVandermonde::get_alignment() const
273 return k
*w
*sizeof(int);
276 int ErasureCodeShecReedSolomonVandermonde::parse(const ErasureCodeProfile
&profile
)
280 if (profile
.find("k") == profile
.end() &&
281 profile
.find("m") == profile
.end() &&
282 profile
.find("c") == profile
.end()){
283 dout(10) << "(k, m, c) default to " << "(" << DEFAULT_K
284 << ", " << DEFAULT_M
<< ", " << DEFAULT_C
<< ")" << dendl
;
285 k
= DEFAULT_K
; m
= DEFAULT_M
; c
= DEFAULT_C
;
286 } else if (profile
.find("k") == profile
.end() ||
287 profile
.find("m") == profile
.end() ||
288 profile
.find("c") == profile
.end()){
289 dout(10) << "(k, m, c) must be chosen" << dendl
;
292 std::string err_k
, err_m
, err_c
, value_k
, value_m
, value_c
;
293 value_k
= profile
.find("k")->second
;
294 value_m
= profile
.find("m")->second
;
295 value_c
= profile
.find("c")->second
;
296 k
= strict_strtol(value_k
.c_str(), 10, &err_k
);
297 m
= strict_strtol(value_m
.c_str(), 10, &err_m
);
298 c
= strict_strtol(value_c
.c_str(), 10, &err_c
);
300 if (!err_k
.empty() || !err_m
.empty() || !err_c
.empty()){
302 derr
<< "could not convert k=" << value_k
<< "to int" << dendl
;
303 } else if (!err_m
.empty()){
304 derr
<< "could not convert m=" << value_m
<< "to int" << dendl
;
305 } else if (!err_c
.empty()){
306 derr
<< "could not convert c=" << value_c
<< "to int" << dendl
;
311 << " must be a positive number" << dendl
;
315 << " must be a positive number" << dendl
;
319 << " must be a positive number" << dendl
;
323 << " must be less than or equal to m=" << m
<< dendl
;
327 << " must be less than or equal to 12" << dendl
;
329 } else if (k
+m
> 20){
330 derr
<< "k+m=" << k
+m
331 << " must be less than or equal to 20" << dendl
;
335 << " must be less than or equal to k=" << k
<< dendl
;
341 derr
<< "(k, m, c)=(" << k
<< ", " << m
<< ", " << c
342 << ") is not a valid parameter." << dendl
;
346 dout(10) << "(k, m, c) set to " << "(" << k
<< ", " << m
<< ", "
350 if (profile
.find("w") == profile
.end()){
351 dout(10) << "w default to " << DEFAULT_W
<< dendl
;
354 std::string err_w
, value_w
;
355 value_w
= profile
.find("w")->second
;
356 w
= strict_strtol(value_w
.c_str(), 10, &err_w
);
359 derr
<< "could not convert w=" << value_w
<< "to int" << dendl
;
360 dout(10) << "w default to " << DEFAULT_W
<< dendl
;
363 } else if (w
!= 8 && w
!= 16 && w
!= 32) {
365 << " must be one of {8, 16, 32}" << dendl
;
366 dout(10) << "w default to " << DEFAULT_W
<< dendl
;
370 dout(10) << "w set to " << w
<< dendl
;
376 void ErasureCodeShecReedSolomonVandermonde::prepare()
378 // setup shared encoding table
380 tcache
.getEncodingTable(technique
, k
, m
, c
, w
);
383 dout(10) << "[ cache tables ] creating coeff for k=" <<
384 k
<< " m=" << m
<< " c=" << c
<< " w=" << w
<< dendl
;
386 matrix
= shec_reedsolomon_coding_matrix(technique
);
388 // either our new created table is stored or if it has been
389 // created in the meanwhile the locally allocated table will be
390 // freed by setEncodingTable
391 matrix
= tcache
.setEncodingTable(technique
, k
, m
, c
, w
, matrix
);
393 dout(10) << "matrix = " << dendl
;
394 for (int i
=0; i
<m
; i
++) {
396 for (int j
=0; j
<k
; j
++) {
397 if (matrix
[i
*k
+j
] > 0) {
404 dout(10) << mat
<< dendl
;
407 matrix
= *p_enc_table
;
410 dout(10) << " [ technique ] = " <<
411 ((technique
== MULTIPLE
) ? "multiple" : "single") << dendl
;
413 ceph_assert((technique
== SINGLE
) || (technique
== MULTIPLE
));
418 // Mearged from shec.cc.
420 double ErasureCodeShec::shec_calc_recovery_efficiency1(int k
, int m1
, int m2
, int c1
, int c2
){
423 int i
, rr
, cc
, start
, end
;
426 if (m1
< c1
|| m2
< c2
) return -1;
427 if ((m1
== 0 && c1
!= 0) || (m2
== 0 && c2
!= 0)) return -1;
429 for (i
=0; i
<k
; i
++) r_eff_k
[i
] = 100000000;
432 for (rr
=0; rr
<m1
; rr
++){
433 start
= ((rr
*k
)/m1
) % k
;
434 end
= (((rr
+c1
)*k
)/m1
) % k
;
435 for (cc
=start
, first_flag
=1; first_flag
|| cc
!=end
; cc
=(cc
+1)%k
){
437 r_eff_k
[cc
] = std::min(r_eff_k
[cc
], ((rr
+c1
)*k
)/m1
- (rr
*k
)/m1
);
439 r_e1
+= ((rr
+c1
)*k
)/m1
- (rr
*k
)/m1
;
442 for (rr
=0; rr
<m2
; rr
++){
443 start
= ((rr
*k
)/m2
) % k
;
444 end
= (((rr
+c2
)*k
)/m2
) % k
;
445 for (cc
=start
, first_flag
=1; first_flag
|| cc
!=end
; cc
=(cc
+1)%k
){
447 r_eff_k
[cc
] = std::min(r_eff_k
[cc
], ((rr
+c2
)*k
)/m2
- (rr
*k
)/m2
);
449 r_e1
+= ((rr
+c2
)*k
)/m2
- (rr
*k
)/m2
;
461 int* ErasureCodeShec::shec_reedsolomon_coding_matrix(int is_single
)
464 int rr
, cc
, start
, end
;
467 if (w
!= 8 && w
!= 16 && w
!= 32) return NULL
;
470 int c1_best
= -1, m1_best
= -1;
471 double min_r_e1
= 100.0;
473 // create all multiple shec pattern and choose best.
475 for (c1
=0; c1
<= c
/2; c1
++){
476 for (m1
=0; m1
<= m
; m1
++){
480 if (m1
< c1
|| m2
< c2
) continue;
481 if ((m1
== 0 && c1
!= 0) || (m2
== 0 && c2
!= 0)) continue;
482 if ((m1
!= 0 && c1
== 0) || (m2
!= 0 && c2
== 0)) continue;
488 r_e1
= shec_calc_recovery_efficiency1(k
, m1
, m2
, c1
, c2
);
489 if (min_r_e1
- r_e1
> std::numeric_limits
<double>::epsilon() &&
510 matrix
= reed_sol_vandermonde_coding_matrix(k
, m
, w
);
512 for (rr
=0; rr
<m1
; rr
++){
513 end
= ((rr
*k
)/m1
) % k
;
514 start
= (((rr
+c1
)*k
)/m1
) % k
;
515 for (cc
=start
; cc
!=end
; cc
=(cc
+1)%k
){
516 matrix
[cc
+ rr
*k
] = 0;
520 for (rr
=0; rr
<m2
; rr
++){
521 end
= ((rr
*k
)/m2
) % k
;
522 start
= (((rr
+c2
)*k
)/m2
) % k
;
523 for (cc
=start
; cc
!=end
; cc
=(cc
+1)%k
){
524 matrix
[cc
+ (rr
+m1
)*k
] = 0;
531 int ErasureCodeShec::shec_make_decoding_matrix(bool prepare
, int *want_
, int *avails
,
532 int *decoding_matrix
, int *dm_row
, int *dm_column
,
535 int mindup
= k
+1, minp
= k
+1;
538 memset(want
, 0, sizeof(want
));
540 for (int i
= 0; i
< k
+ m
; ++i
) {
544 for (int i
= 0; i
< m
; ++i
) {
545 if (want
[i
+ k
] && !avails
[i
+ k
]) {
546 for (int j
=0; j
< k
; ++j
) {
547 if (matrix
[i
* k
+ j
] > 0) {
554 if (tcache
.getDecodingTableFromCache(decoding_matrix
,
555 dm_row
, dm_column
, minimum
,
562 for (unsigned long long pp
= 0; pp
< (1ull << m
); ++pp
) {
564 // select parity chunks
567 for (int i
=0; i
< m
; ++i
) {
568 if (pp
& (1ull << i
)) {
576 // Are selected parity chunks avail?
578 for (int i
= 0; i
< ek
&& ok
; i
++) {
579 if (!avails
[k
+p
[i
]]) {
591 for (int i
= 0; i
< k
+ m
; i
++) {
594 for (int i
= 0; i
< k
; i
++) {
598 for (int i
=0; i
< k
; i
++) {
599 if (want
[i
] && !avails
[i
]) {
604 // Parity chunks which are used to recovery erased data chunks, are added to tmprow.
605 for (int i
= 0; i
< ek
; i
++) {
606 tmprow
[k
+ p
[i
]] = 1;
607 for (int j
= 0; j
< k
; j
++) {
608 int element
= matrix
[(p
[i
]) * k
+ j
];
612 if (element
!= 0 && avails
[j
] == 1) {
618 int dup_row
= 0, dup_column
= 0, dup
= 0;
619 for (int i
= 0; i
< k
+ m
; i
++) {
625 for (int i
= 0; i
< k
; i
++) {
631 if (dup_row
!= dup_column
) {
637 for (int i
= 0; i
< k
; i
++) {
640 for (int i
= 0; i
< k
; i
++) {
646 // minimum is updated.
648 int tmpmat
[dup
* dup
];
650 for (int i
= 0, row
= 0; i
< k
+ m
; i
++) {
652 for (int j
= 0, column
= 0; j
< k
; j
++) {
655 tmpmat
[row
* dup
+ column
] = (i
== j
? 1 : 0);
657 tmpmat
[row
* dup
+ column
] = matrix
[(i
- k
) * k
+ j
];
666 int det
= calc_determinant(tmpmat
, dup
);
671 for (int i
= 0; i
< k
; i
++) {
674 for (int i
= 0; i
< k
; i
++) {
679 for (int i
=0; i
< k
+ m
; i
++) {
681 dm_row
[row_id
++] = i
;
684 for (int i
=0; i
< k
; i
++) {
686 dm_column
[column_id
++] = i
;
696 fprintf(stderr
, "shec_make_decoding_matrix(): can't find recover matrix.\n");
700 for (int i
= 0; i
< k
+ m
; i
++) {
704 for (int i
=0; i
< k
&& dm_row
[i
] != -1; i
++) {
705 minimum
[dm_row
[i
]] = 1;
708 for (int i
= 0; i
< k
; ++i
) {
709 if (want
[i
] && avails
[i
]) {
714 for (int i
= 0; i
< m
; ++i
) {
715 if (want
[k
+ i
] && avails
[k
+ i
] && !minimum
[k
+ i
]) {
716 for (int j
= 0; j
< k
; ++j
) {
717 if (matrix
[i
* k
+ j
] > 0 && !want
[j
]) {
729 int tmpmat
[mindup
* mindup
];
730 for (int i
=0; i
< mindup
; i
++) {
731 for (int j
=0; j
< mindup
; j
++) {
733 tmpmat
[i
* mindup
+ j
] = (dm_row
[i
] == dm_column
[j
] ? 1 : 0);
735 tmpmat
[i
* mindup
+ j
] = matrix
[(dm_row
[i
] - k
) * k
+ dm_column
[j
]];
739 for (int j
= 0; j
< mindup
; j
++) {
740 if (dm_row
[i
] == dm_column
[j
]) {
745 dm_row
[i
] -= (k
- mindup
);
753 int ret
= jerasure_invert_matrix(tmpmat
, decoding_matrix
, mindup
, w
);
755 tcache
.putDecodingTableToCache(decoding_matrix
, dm_row
, dm_column
, minimum
, technique
,
756 k
, m
, c
, w
, want
, avails
);
761 int ErasureCodeShec::shec_matrix_decode(int *want
, int *avails
, char **data_ptrs
,
762 char **coding_ptrs
, int size
)
764 int decoding_matrix
[k
*k
];
765 int dm_row
[k
], dm_column
[k
];
768 memset(decoding_matrix
, 0, sizeof(decoding_matrix
));
769 memset(dm_row
, -1, sizeof(dm_row
));
770 memset(dm_column
, -1, sizeof(dm_column
));
771 memset(minimum
, -1, sizeof(minimum
));
773 if (w
!= 8 && w
!= 16 && w
!= 32) return -1;
775 if (shec_make_decoding_matrix(false, want
, avails
, decoding_matrix
,
776 dm_row
, dm_column
, minimum
) < 0) {
780 // Get decoding matrix size
782 for (int i
= 0; i
< k
; i
++) {
783 if (dm_row
[i
] == -1) {
789 char *dm_data_ptrs
[dm_size
];
790 for (int i
= 0; i
< dm_size
; i
++) {
791 dm_data_ptrs
[i
] = data_ptrs
[dm_column
[i
]];
794 // Decode the data drives
795 for (int i
= 0; i
< dm_size
; i
++) {
796 if (!avails
[dm_column
[i
]]) {
797 jerasure_matrix_dotprod(dm_size
, w
, decoding_matrix
+ (i
* dm_size
),
798 dm_row
, i
, dm_data_ptrs
, coding_ptrs
, size
);
802 // Re-encode any erased coding devices
803 for (int i
= 0; i
< m
; i
++) {
804 if (want
[k
+i
] && !avails
[k
+i
]) {
805 jerasure_matrix_dotprod(k
, w
, matrix
+ (i
* k
), NULL
, i
+k
,
806 data_ptrs
, coding_ptrs
, size
);