]>
git.proxmox.com Git - ceph.git/blob - ceph/src/erasure-code/shec/ErasureCodeShec.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2014 FUJITSU LIMITED
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
10 * Author: Takanori Nakao <nakao.takanori@jp.fujitsu.com>
11 * Author: Takeshi Miyamae <miyamae.takeshi@jp.fujitsu.com>
12 * Author: Loic Dachary <loic@dachary.org>
14 * This library is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU Lesser General Public
16 * License as published by the Free Software Foundation; either
17 * version 2.1 of the License, or (at your option) any later version.
28 #include "common/debug.h"
29 #include "ErasureCodeShec.h"
31 #include "jerasure/include/jerasure.h"
32 #include "jerasure/include/galois.h"
34 extern int calc_determinant(int *matrix
, int dim
);
35 extern int* reed_sol_vandermonde_coding_matrix(int k
, int m
, int w
);
38 #define dout_context g_ceph_context
39 #define dout_subsys ceph_subsys_osd
41 #define dout_prefix _prefix(_dout)
43 static ostream
& _prefix(std::ostream
* _dout
)
45 return *_dout
<< "ErasureCodeShec: ";
48 int ErasureCodeShec::init(ErasureCodeProfile
&profile
,
52 err
|= parse(profile
);
56 return ErasureCode::init(profile
, ss
);
59 unsigned int ErasureCodeShec::get_chunk_size(unsigned int object_size
) const
61 unsigned alignment
= get_alignment();
62 unsigned tail
= object_size
% alignment
;
63 unsigned padded_length
= object_size
+ ( tail
? ( alignment
- tail
) : 0 );
65 assert(padded_length
% k
== 0);
66 return padded_length
/ k
;
69 int ErasureCodeShec::minimum_to_decode(const set
<int> &want_to_read
,
70 const set
<int> &available_chunks
,
71 set
<int> *minimum_chunks
)
73 if (!minimum_chunks
) return -EINVAL
;
75 for (set
<int>::iterator it
= available_chunks
.begin(); it
!= available_chunks
.end(); ++it
){
76 if (*it
< 0 || k
+m
<= *it
) return -EINVAL
;
79 for (set
<int>::iterator it
= want_to_read
.begin(); it
!= want_to_read
.end(); ++it
){
80 if (*it
< 0 || k
+m
<= *it
) return -EINVAL
;
87 memset(want
, 0, sizeof(want
));
88 memset(avails
, 0, sizeof(avails
));
89 memset(minimum
, 0, sizeof(minimum
));
90 (*minimum_chunks
).clear();
92 for (set
<int>::const_iterator i
= want_to_read
.begin();
93 i
!= want_to_read
.end();
98 for (set
<int>::const_iterator i
= available_chunks
.begin();
99 i
!= available_chunks
.end();
105 int decoding_matrix
[k
*k
];
108 memset(decoding_matrix
, 0, sizeof(decoding_matrix
));
109 memset(dm_row
, 0, sizeof(dm_row
));
110 memset(dm_column
, 0, sizeof(dm_column
));
111 if (shec_make_decoding_matrix(true, want
, avails
, decoding_matrix
, dm_row
, dm_column
, minimum
) < 0) {
116 for (int i
= 0; i
< k
+ m
; i
++) {
117 if (minimum
[i
] == 1) minimum_chunks
->insert(i
);
123 int ErasureCodeShec::minimum_to_decode_with_cost(const set
<int> &want_to_read
,
124 const map
<int, int> &available
,
125 set
<int> *minimum_chunks
)
127 set
<int> available_chunks
;
129 for (map
<int, int>::const_iterator i
= available
.begin();
130 i
!= available
.end();
132 available_chunks
.insert(i
->first
);
134 return minimum_to_decode(want_to_read
, available_chunks
, minimum_chunks
);
137 int ErasureCodeShec::encode(const set
<int> &want_to_encode
,
138 const bufferlist
&in
,
139 map
<int, bufferlist
> *encoded
)
141 unsigned int k
= get_data_chunk_count();
142 unsigned int m
= get_chunk_count() - k
;
145 if (!encoded
|| !encoded
->empty()){
149 int err
= encode_prepare(in
, *encoded
);
152 encode_chunks(want_to_encode
, encoded
);
153 for (unsigned int i
= 0; i
< k
+ m
; i
++) {
154 if (want_to_encode
.count(i
) == 0)
160 int ErasureCodeShec::encode_chunks(const set
<int> &want_to_encode
,
161 map
<int, bufferlist
> *encoded
)
164 for (int i
= 0; i
< k
+ m
; i
++){
165 chunks
[i
] = (*encoded
)[i
].c_str();
167 shec_encode(&chunks
[0], &chunks
[k
], (*encoded
)[0].length());
171 int ErasureCodeShec::decode(const set
<int> &want_to_read
,
172 const map
<int, bufferlist
> &chunks
,
173 map
<int, bufferlist
> *decoded
)
177 if (!decoded
|| !decoded
->empty()){
181 have
.reserve(chunks
.size());
182 for (map
<int, bufferlist
>::const_iterator i
= chunks
.begin();
185 have
.push_back(i
->first
);
188 have
.begin(), have
.end(), want_to_read
.begin(), want_to_read
.end())) {
189 for (set
<int>::iterator i
= want_to_read
.begin();
190 i
!= want_to_read
.end();
192 (*decoded
)[*i
] = chunks
.find(*i
)->second
;
196 unsigned int k
= get_data_chunk_count();
197 unsigned int m
= get_chunk_count() - k
;
198 unsigned blocksize
= (*chunks
.begin()).second
.length();
199 for (unsigned int i
= 0; i
< k
+ m
; i
++) {
200 if (chunks
.find(i
) == chunks
.end()) {
201 bufferptr
ptr(buffer::create_aligned(blocksize
, SIMD_ALIGN
));
202 (*decoded
)[i
].push_front(ptr
);
204 (*decoded
)[i
] = chunks
.find(i
)->second
;
205 (*decoded
)[i
].rebuild_aligned(SIMD_ALIGN
);
208 return decode_chunks(want_to_read
, chunks
, decoded
);
211 int ErasureCodeShec::decode_chunks(const set
<int> &want_to_read
,
212 const map
<int, bufferlist
> &chunks
,
213 map
<int, bufferlist
> *decoded
)
215 unsigned blocksize
= (*chunks
.begin()).second
.length();
217 int erased_count
= 0;
222 for (int i
= 0; i
< k
+ m
; i
++) {
224 if (chunks
.find(i
) == chunks
.end()) {
225 if (want_to_read
.count(i
) > 0) {
234 data
[i
] = (*decoded
)[i
].c_str();
236 coding
[i
- k
] = (*decoded
)[i
].c_str();
239 if (erased_count
> 0) {
240 return shec_decode(erased
, avails
, data
, coding
, blocksize
);
247 // ErasureCodeShecReedSolomonVandermonde
250 void ErasureCodeShecReedSolomonVandermonde::shec_encode(char **data
,
254 jerasure_matrix_encode(k
, m
, w
, matrix
, data
, coding
, blocksize
);
257 int ErasureCodeShecReedSolomonVandermonde::shec_decode(int *erased
,
263 return shec_matrix_decode(erased
, avails
, data
, coding
, blocksize
);
266 unsigned ErasureCodeShecReedSolomonVandermonde::get_alignment() const
268 return k
*w
*sizeof(int);
271 int ErasureCodeShecReedSolomonVandermonde::parse(const ErasureCodeProfile
&profile
)
275 if (profile
.find("k") == profile
.end() &&
276 profile
.find("m") == profile
.end() &&
277 profile
.find("c") == profile
.end()){
278 dout(10) << "(k, m, c) default to " << "(" << DEFAULT_K
279 << ", " << DEFAULT_M
<< ", " << DEFAULT_C
<< ")" << dendl
;
280 k
= DEFAULT_K
; m
= DEFAULT_M
; c
= DEFAULT_C
;
281 } else if (profile
.find("k") == profile
.end() ||
282 profile
.find("m") == profile
.end() ||
283 profile
.find("c") == profile
.end()){
284 dout(10) << "(k, m, c) must be chosen" << dendl
;
287 std::string err_k
, err_m
, err_c
, value_k
, value_m
, value_c
;
288 value_k
= profile
.find("k")->second
;
289 value_m
= profile
.find("m")->second
;
290 value_c
= profile
.find("c")->second
;
291 k
= strict_strtol(value_k
.c_str(), 10, &err_k
);
292 m
= strict_strtol(value_m
.c_str(), 10, &err_m
);
293 c
= strict_strtol(value_c
.c_str(), 10, &err_c
);
295 if (!err_k
.empty() || !err_m
.empty() || !err_c
.empty()){
297 derr
<< "could not convert k=" << value_k
<< "to int" << dendl
;
298 } else if (!err_m
.empty()){
299 derr
<< "could not convert m=" << value_m
<< "to int" << dendl
;
300 } else if (!err_c
.empty()){
301 derr
<< "could not convert c=" << value_c
<< "to int" << dendl
;
306 << " must be a positive number" << dendl
;
310 << " must be a positive number" << dendl
;
314 << " must be a positive number" << dendl
;
318 << " must be less than or equal to m=" << m
<< dendl
;
322 << " must be less than or equal to 12" << dendl
;
324 } else if (k
+m
> 20){
325 derr
<< "k+m=" << k
+m
326 << " must be less than or equal to 20" << dendl
;
330 << " must be less than or equal to k=" << k
<< dendl
;
336 derr
<< "(k, m, c)=(" << k
<< ", " << m
<< ", " << c
337 << ") is not a valid parameter." << dendl
;
341 dout(10) << "(k, m, c) set to " << "(" << k
<< ", " << m
<< ", "
345 if (profile
.find("w") == profile
.end()){
346 dout(10) << "w default to " << DEFAULT_W
<< dendl
;
349 std::string err_w
, value_w
;
350 value_w
= profile
.find("w")->second
;
351 w
= strict_strtol(value_w
.c_str(), 10, &err_w
);
354 derr
<< "could not convert w=" << value_w
<< "to int" << dendl
;
355 dout(10) << "w default to " << DEFAULT_W
<< dendl
;
358 } else if (w
!= 8 && w
!= 16 && w
!= 32) {
360 << " must be one of {8, 16, 32}" << dendl
;
361 dout(10) << "w default to " << DEFAULT_W
<< dendl
;
365 dout(10) << "w set to " << w
<< dendl
;
371 void ErasureCodeShecReedSolomonVandermonde::prepare()
373 // setup shared encoding table
375 tcache
.getEncodingTable(technique
, k
, m
, c
, w
);
378 dout(10) << "[ cache tables ] creating coeff for k=" <<
379 k
<< " m=" << m
<< " c=" << c
<< " w=" << w
<< dendl
;
381 matrix
= shec_reedsolomon_coding_matrix(technique
);
383 // either our new created table is stored or if it has been
384 // created in the meanwhile the locally allocated table will be
385 // freed by setEncodingTable
386 matrix
= tcache
.setEncodingTable(technique
, k
, m
, c
, w
, matrix
);
388 dout(10) << "matrix = " << dendl
;
389 for (int i
=0; i
<m
; i
++) {
391 for (int j
=0; j
<k
; j
++) {
392 if (matrix
[i
*k
+j
] > 0) {
399 dout(10) << mat
<< dendl
;
402 matrix
= *p_enc_table
;
405 dout(10) << " [ technique ] = " <<
406 ((technique
== MULTIPLE
) ? "multiple" : "single") << dendl
;
408 assert((technique
== SINGLE
) || (technique
== MULTIPLE
));
413 // Mearged from shec.cc.
415 double ErasureCodeShec::shec_calc_recovery_efficiency1(int k
, int m1
, int m2
, int c1
, int c2
){
418 int i
, rr
, cc
, start
, end
;
421 if (m1
< c1
|| m2
< c2
) return -1;
422 if ((m1
== 0 && c1
!= 0) || (m2
== 0 && c2
!= 0)) return -1;
424 for (i
=0; i
<k
; i
++) r_eff_k
[i
] = 100000000;
427 for (rr
=0; rr
<m1
; rr
++){
428 start
= ((rr
*k
)/m1
) % k
;
429 end
= (((rr
+c1
)*k
)/m1
) % k
;
430 for (cc
=start
, first_flag
=1; first_flag
|| cc
!=end
; cc
=(cc
+1)%k
){
432 r_eff_k
[cc
] = std::min(r_eff_k
[cc
], ((rr
+c1
)*k
)/m1
- (rr
*k
)/m1
);
434 r_e1
+= ((rr
+c1
)*k
)/m1
- (rr
*k
)/m1
;
437 for (rr
=0; rr
<m2
; rr
++){
438 start
= ((rr
*k
)/m2
) % k
;
439 end
= (((rr
+c2
)*k
)/m2
) % k
;
440 for (cc
=start
, first_flag
=1; first_flag
|| cc
!=end
; cc
=(cc
+1)%k
){
442 r_eff_k
[cc
] = std::min(r_eff_k
[cc
], ((rr
+c2
)*k
)/m2
- (rr
*k
)/m2
);
444 r_e1
+= ((rr
+c2
)*k
)/m2
- (rr
*k
)/m2
;
456 int* ErasureCodeShec::shec_reedsolomon_coding_matrix(int is_single
)
459 int rr
, cc
, start
, end
;
462 if (w
!= 8 && w
!= 16 && w
!= 32) return NULL
;
465 int c1_best
= -1, m1_best
= -1;
466 double min_r_e1
= 100.0;
468 // create all multiple shec pattern and choose best.
470 for (c1
=0; c1
<= c
/2; c1
++){
471 for (m1
=0; m1
<= m
; m1
++){
475 if (m1
< c1
|| m2
< c2
) continue;
476 if ((m1
== 0 && c1
!= 0) || (m2
== 0 && c2
!= 0)) continue;
477 if ((m1
!= 0 && c1
== 0) || (m2
!= 0 && c2
== 0)) continue;
483 r_e1
= shec_calc_recovery_efficiency1(k
, m1
, m2
, c1
, c2
);
484 if (min_r_e1
- r_e1
> std::numeric_limits
<double>::epsilon() &&
505 matrix
= reed_sol_vandermonde_coding_matrix(k
, m
, w
);
507 for (rr
=0; rr
<m1
; rr
++){
508 end
= ((rr
*k
)/m1
) % k
;
509 start
= (((rr
+c1
)*k
)/m1
) % k
;
510 for (cc
=start
; cc
!=end
; cc
=(cc
+1)%k
){
511 matrix
[cc
+ rr
*k
] = 0;
515 for (rr
=0; rr
<m2
; rr
++){
516 end
= ((rr
*k
)/m2
) % k
;
517 start
= (((rr
+c2
)*k
)/m2
) % k
;
518 for (cc
=start
; cc
!=end
; cc
=(cc
+1)%k
){
519 matrix
[cc
+ (rr
+m1
)*k
] = 0;
526 int ErasureCodeShec::shec_make_decoding_matrix(bool prepare
, int *want_
, int *avails
,
527 int *decoding_matrix
, int *dm_row
, int *dm_column
,
530 int mindup
= k
+1, minp
= k
+1;
533 memset(want
, 0, sizeof(want
));
535 for (int i
= 0; i
< k
+ m
; ++i
) {
539 for (int i
= 0; i
< m
; ++i
) {
540 if (want
[i
+ k
] && !avails
[i
+ k
]) {
541 for (int j
=0; j
< k
; ++j
) {
542 if (matrix
[i
* k
+ j
] > 0) {
549 if (tcache
.getDecodingTableFromCache(decoding_matrix
,
550 dm_row
, dm_column
, minimum
,
557 for (unsigned long long pp
= 0; pp
< (1ull << m
); ++pp
) {
559 // select parity chunks
562 for (int i
=0; i
< m
; ++i
) {
563 if (pp
& (1ull << i
)) {
571 // Are selected parity chunks avail?
573 for (int i
= 0; i
< ek
&& ok
; i
++) {
574 if (!avails
[k
+p
[i
]]) {
586 for (int i
= 0; i
< k
+ m
; i
++) {
589 for (int i
= 0; i
< k
; i
++) {
593 for (int i
=0; i
< k
; i
++) {
594 if (want
[i
] && !avails
[i
]) {
599 // Parity chunks which are used to recovery erased data chunks, are added to tmprow.
600 for (int i
= 0; i
< ek
; i
++) {
601 tmprow
[k
+ p
[i
]] = 1;
602 for (int j
= 0; j
< k
; j
++) {
603 int element
= matrix
[(p
[i
]) * k
+ j
];
607 if (element
!= 0 && avails
[j
] == 1) {
613 int dup_row
= 0, dup_column
= 0, dup
= 0;
614 for (int i
= 0; i
< k
+ m
; i
++) {
620 for (int i
= 0; i
< k
; i
++) {
626 if (dup_row
!= dup_column
) {
632 for (int i
= 0; i
< k
; i
++) {
635 for (int i
= 0; i
< k
; i
++) {
641 // minimum is updated.
643 int tmpmat
[dup
* dup
];
645 for (int i
= 0, row
= 0; i
< k
+ m
; i
++) {
647 for (int j
= 0, column
= 0; j
< k
; j
++) {
650 tmpmat
[row
* dup
+ column
] = (i
== j
? 1 : 0);
652 tmpmat
[row
* dup
+ column
] = matrix
[(i
- k
) * k
+ j
];
661 int det
= calc_determinant(tmpmat
, dup
);
666 for (int i
= 0; i
< k
; i
++) {
669 for (int i
= 0; i
< k
; i
++) {
674 for (int i
=0; i
< k
+ m
; i
++) {
676 dm_row
[row_id
++] = i
;
679 for (int i
=0; i
< k
; i
++) {
681 dm_column
[column_id
++] = i
;
691 fprintf(stderr
, "shec_make_decoding_matrix(): can't find recover matrix.\n");
695 for (int i
= 0; i
< k
+ m
; i
++) {
699 for (int i
=0; i
< k
&& dm_row
[i
] != -1; i
++) {
700 minimum
[dm_row
[i
]] = 1;
703 for (int i
= 0; i
< k
; ++i
) {
704 if (want
[i
] && avails
[i
]) {
709 for (int i
= 0; i
< m
; ++i
) {
710 if (want
[k
+ i
] && avails
[k
+ i
] && !minimum
[k
+ i
]) {
711 for (int j
= 0; j
< k
; ++j
) {
712 if (matrix
[i
* k
+ j
] > 0 && !want
[j
]) {
724 int tmpmat
[mindup
* mindup
];
725 for (int i
=0; i
< mindup
; i
++) {
726 for (int j
=0; j
< mindup
; j
++) {
728 tmpmat
[i
* mindup
+ j
] = (dm_row
[i
] == dm_column
[j
] ? 1 : 0);
730 tmpmat
[i
* mindup
+ j
] = matrix
[(dm_row
[i
] - k
) * k
+ dm_column
[j
]];
734 for (int j
= 0; j
< mindup
; j
++) {
735 if (dm_row
[i
] == dm_column
[j
]) {
740 dm_row
[i
] -= (k
- mindup
);
748 int ret
= jerasure_invert_matrix(tmpmat
, decoding_matrix
, mindup
, w
);
750 tcache
.putDecodingTableToCache(decoding_matrix
, dm_row
, dm_column
, minimum
, technique
,
751 k
, m
, c
, w
, want
, avails
);
756 int ErasureCodeShec::shec_matrix_decode(int *want
, int *avails
, char **data_ptrs
,
757 char **coding_ptrs
, int size
)
759 int decoding_matrix
[k
*k
];
760 int dm_row
[k
], dm_column
[k
];
763 memset(decoding_matrix
, 0, sizeof(decoding_matrix
));
764 memset(dm_row
, -1, sizeof(dm_row
));
765 memset(dm_column
, -1, sizeof(dm_column
));
766 memset(minimum
, -1, sizeof(minimum
));
768 if (w
!= 8 && w
!= 16 && w
!= 32) return -1;
770 if (shec_make_decoding_matrix(false, want
, avails
, decoding_matrix
,
771 dm_row
, dm_column
, minimum
) < 0) {
775 // Get decoding matrix size
777 for (int i
= 0; i
< k
; i
++) {
778 if (dm_row
[i
] == -1) {
784 char *dm_data_ptrs
[dm_size
];
785 for (int i
= 0; i
< dm_size
; i
++) {
786 dm_data_ptrs
[i
] = data_ptrs
[dm_column
[i
]];
789 // Decode the data drives
790 for (int i
= 0; i
< dm_size
; i
++) {
791 if (!avails
[dm_column
[i
]]) {
792 jerasure_matrix_dotprod(dm_size
, w
, decoding_matrix
+ (i
* dm_size
),
793 dm_row
, i
, dm_data_ptrs
, coding_ptrs
, size
);
797 // Re-encode any erased coding devices
798 for (int i
= 0; i
< m
; i
++) {
799 if (want
[k
+i
] && !avails
[k
+i
]) {
800 jerasure_matrix_dotprod(k
, w
, matrix
+ (i
* k
), NULL
, i
+k
,
801 data_ptrs
, coding_ptrs
, size
);