]>
Commit | Line | Data |
---|---|---|
20effc67 TL |
1 | #!/bin/bash |
2 | # https://tracker.ceph.com/issues/47839 | |
3 | # Signed-off-by: Chris Dunlop <chris@onthe.net.au> | |
4 | ||
5 | ||
6 | ###################################################################### | |
#
# Print the usage message and abort.
#
# This is only called when the command line is malformed, so the text goes
# to stderr and the exit status is non-zero: a mis-invoked run must not
# look like a success to whatever called the script.
#
# Globals:   $0 (read)
# Arguments: none
# Outputs:   usage text on stderr
# Returns:   never; exits the script with status 2
#
function usage
{
  cat >&2 <<END
Usage: $0 osd device

Description:

  Migrate an OSD from Filestore to BlueStore

Where:

  osd    - OSD ID to migrate
  device - raw device to migrate to, starting with /dev/disk/by-id/

E.g.:

  ceph-migrate-bluestore 6 /dev/disk/by-id/ata-WDC_WD80EFZX-68UW8N0_VK0RKXTY

END
  exit 2
}
28 | ###################################################################### | |
29 | ||
#
# Helpers that the rest of the script calls but that were not defined
# anywhere in the file. They are defined here, ahead of their first use
# (the argument checks just below).
#

#
# Print an error message on stderr and abort the script.
# Arguments: the message
# Returns:   never; exits with status 1
#
function error
{
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

#
# Print a horizontal rule on stdout to visually separate output sections.
#
function hr
{
  printf '%s\n' '----------------------------------------------------------------------'
}

#
# Strict mode: abort on unhandled errors, unset variables and failing
# pipeline stages. extglob is needed for the +([0-9]) pattern used further
# down; inherit_errexit makes command substitutions honour errexit too.
#
shopt -s -o errexit nounset pipefail
shopt -s extglob failglob inherit_errexit lastpipe

#
# Exactly two arguments: the OSD id and the target device
#
[[ $# -eq 2 ]] || usage
osd=$1
bluestore_device=$2

[[ $osd =~ ^[0-9]+$ ]] || error 'osd must be numeric'

# The target must be an existing block device, given by its
# /dev/disk/by-id/ name, and must not be a partition (-partN suffix).
[[
  -b $bluestore_device &&
  $bluestore_device =~ ^/dev/disk/by-id/ &&
  ! $bluestore_device =~ -part[0-9]+$
]] || error "device must be a raw block device starting with /dev/disk/by-id/"
43 | ||
######################################################################
# Configuration
#

# Volume group holding the block.db LVs; the script refuses to run
# if "vgs" cannot find it.
vgdb='vg-861d7200-578c-45c2-a44c-2f0c56427bf1'
if ! vgs "${vgdb}" > /dev/null 2>&1
then
  error "VG '${vgdb}' for block.db not found"
fi

# Size of the block.db LV that gets created in ${vgdb}
dblvsize=60G

# Prefix used for the names of block LVs
block_prefix='osd-block'

# Less commonly installed commands this script uses; their presence is
# checked before any work is done so we can abort early.
cmds=(bc sgdisk)
71 | ||
72 | ###################################################################### | |
73 | # Functions... | |
74 | # | |
#
# Show a command line on stderr, then execute it.
#
# Arguments: the command followed by its arguments
# Outputs:   the space-joined command line on stderr, plus whatever the
#            command itself prints
# Returns:   the exit status of the command
#
runcmd() {
  # IFS is a single space for the whole call: it is the separator "$*"
  # joins the arguments with, and it is still in effect while the
  # command (which may be a shell function) runs.
  local IFS=' '
  echo "$*" >&2
  "$@"
}
81 | ||
#
# Succeed if the argument is a UUID in canonical 8-4-4-4-12 hex form
# (either letter case).
#
# Arguments: $1 - string to test
# Returns:   0 if it matches, 1 otherwise
#
is_uuid() {
  local -r uuid_re='^[[:xdigit:]]{8}-[[:xdigit:]]{4}-[[:xdigit:]]{4}-[[:xdigit:]]{4}-[[:xdigit:]]{12}$'
  # Unquoted on purpose so the variable is used as a regex
  [[ $1 =~ ${uuid_re} ]]
}
86 | ||
#
# Check the new device is big enough for the data currently on the OSD.
#
# The OSD's "RAW USE" figure from "ceph osd df" is padded by an arbitrary
# 20% ('cos we don't want to fill the new device up) and compared with the
# size of the target block device.
#
# Globals:   osd, bluestore_device (read)
# Arguments: none
# Outputs:   "declare -p" dump of the two byte counts on stdout
# Returns:   0 if the device is large enough; otherwise aborts via error
#
function check-device-size
{
  local df_row sz units pow osdbytes bdev bdevbytes

  # "ceph osd df" fields 7 and 8 - "RAW USE", size and units.
  # Captured separately so a failing ceph command is reported rather than
  # being hidden inside a here-string.
  df_row=$(ceph osd df | awk -v "id=${osd}" '$1==id { print $7, $8 }') ||
    error "ceph osd df failed"
  [[ $df_row ]] || error "osd.${osd} not found in 'ceph osd df' output"
  IFS=' ' read -r sz units <<< "${df_row}"

  # Power of 1024 that converts the reported figure into bytes
  case $units in
    B)   pow=0 ;;
    KiB) pow=1 ;;
    MiB) pow=2 ;;
    GiB) pow=3 ;;
    TiB) pow=4 ;;
    PiB) pow=5 ;;
    *) error "ceph df: units not recognized: ${units}" ;;
  esac

  # bc copes with the fractional size; printf rounds to a whole byte count
  osdbytes=$(printf '%.0f' "$(bc <<< "${sz} * 1024^${pow} * 1.2")")

  # Resolve the by-id symlink to the kernel device name, then read its
  # size from sysfs (multiplied by 512 to get bytes)
  bdev=$(realpath "${bluestore_device}")
  bdev=${bdev##*/}
  [[ -e /sys/block/${bdev}/size ]] || error "Can't find size for ${bluestore_device}"
  bdevbytes=$(( $(< "/sys/block/${bdev}/size") * 512 ))

  declare -p osdbytes bdevbytes

  (( bdevbytes >= osdbytes )) || error "The block device isn't large enough"
}
114 | ||
#
# Interactive check that the cluster looks ok after the migration.
#
# Shows "ceph -s" until the operator is satisfied, kicks off a scrub of
# the OSD while tailing its log file, then shows "ceph -s" again.
#
# Is there a better way of checking, other than manually?
#
# Globals:   osd (read)
# Arguments: none
# Returns:   0 once the operator chooses to continue
#
function check-ceph-ok
{
  local ans=r
  local pid

  while [[ $ans = r ]]
  do
    runcmd ceph -s
    read -r -p $'\nCheck status above and press r to recheck or <Enter> to continue with scrub' ans
  done

  #
  # Run a scrub "to be sure, to be sure"
  #
  # For smaller OSDs we can see which PGs we need to watch for...
  #
  runcmd ceph pg ls-by-primary "${osd}" | awk '$1~/^[0-9]+\./ { print $1 }'
  runcmd ceph osd scrub "${osd}"

  hr
  tail -n0 -f "/var/log/ceph/ceph-osd.${osd}.log" &
  pid=$!
  sleep 2
  # Re-issue the prompt every 10 seconds (it scrolls away as the log
  # grows) until the operator presses <Enter>
  while ! read -r -t 10 -p $'\n\n\ntailing osd log file: press <Enter> to continue\n\n\n' ans
  do
    :
  done
  # tail may already be gone (e.g. the log file didn't exist); under
  # errexit a failing kill would abort the migration at this late stage,
  # so failure is tolerated here. wait reaps the background job.
  kill "${pid}" 2> /dev/null || true
  wait "${pid}" 2> /dev/null || true
  hr

  ans=r
  while [[ $ans = r ]]
  do
    runcmd ceph -s
    read -r -p $'\nCheck status above and press r to recheck or <Enter> to continue' ans
  done
}
156 | ||
#
# Stop the FileStore copy of the OSD from coming back on reboot, in a way
# that can be reverted if necessary.
#
# Two things are done:
#   - any active /etc/fstab entry mounting ${osddir} is commented out
#     (a copy of the file is saved first, once)
#   - partition 1 of ${filestore_device} is changed from the "Ceph OSD"
#     type to "Linux filesystem data" (a backup of the partition table is
#     saved first, once)
#
# https://en.wikipedia.org/wiki/GUID_Partition_Table#Partition_type_GUIDs
#
# Globals:   osddir, osd_json, filestore_device (read); part_guid (written)
# Arguments: none
#
function disable-filestore
{
  local fstab=/etc/fstab
  local fstab_saved="${fstab}.${0##*/}"
  local table_backup="${osd_json%.json}.part"
  local type_ceph_osd=4FBD7E29-9D25-41B8-AFD0-062C0CEFF05D
  local type_linux_fs=0FC63DAF-8483-4772-8E79-3D69D8477DE4

  # The entry may be in fstab for xfs with logdev etc.
  if grep -qE "^[^#[:space:]]+[[:space:]]+${osddir}[[:space:]]" "${fstab}"
  then
    if [[ ! -e ${fstab_saved} ]]
    then
      cp -a "${fstab}" "${fstab_saved}"
    fi
    # Slashes in the path are escaped as the sed address is /-delimited
    sed -ri "/^[^#[:space:]]+[[:space:]]+${osddir//\//\\\/}[[:space:]]/ s/^/# /" "${fstab}"
  fi

  # Keep the first backup of the partition table if one is already there
  if [[ ! -e ${table_backup} ]]
  then
    runcmd sgdisk --backup="${table_backup}" "${filestore_device}"
  fi

  # Only retype the partition if it is currently marked as a Ceph OSD
  part_guid=$(sgdisk -i1 "${filestore_device}" | sed -rn 's/^Partition GUID code: ([[:xdigit:]-]+) .*/\1/p')
  if [[ $part_guid = "${type_ceph_osd}" ]]
  then
    runcmd sgdisk --typecode=1:"${type_linux_fs}" "${filestore_device}"
    echo "${filestore_device} partition 1 changed to type ${type_linux_fs} (Linux filesystem data)"
  fi
}
189 | ||
190 | ###################################################################### | |
191 | # Processing... | |
192 | # | |
193 | ||
#
# Abort before doing anything if a required command is not available
#
for cmd in "${cmds[@]}"
do
  if ! type "${cmd}" > /dev/null 2>&1
  then
    error "${cmd} utility required"
  fi
done
201 | ||
#
# Gather and sanity check everything we need to know about the
# existing (FileStore) OSD
#
unit="ceph-osd@${osd}"
if ! runcmd systemctl is-enabled "${unit}"
then
  error "systemd unit ${unit} not enabled"
fi

osddir="/var/lib/ceph/osd/ceph-${osd}"
if [[ ! -d $osddir ]]
then
  error "No directory: ${osddir}"
fi

fsid=$(< "${osddir}/fsid")
if ! is_uuid "${fsid}"
then
  error "fsid uuid not found in ${osddir}/fsid"
fi

osd_json="/etc/ceph/osd/${osd}-${fsid}.json"
if [[ ! -f $osd_json ]]
then
  error "File doesn't exist: ${osd_json}"
fi

# Name (vg/lv) of the block.db LV to be created for this OSD
lvnewdb="${vgdb}/osd-db-${fsid}"

# The value of the indented "key = ..." line in the OSD's keyring
authkey=$(sed -rn 's/^[[:space:]]+key[[:space:]]*=[[:space:]]*//p' "${osddir}/keyring")
if [[ ! $authkey ]]
then
  error "Can't get authkey from ${osddir}/keyring"
fi

#
# Find the device holding the FileStore version of the OSD. It gets
# disabled once the BlueStore version is up and running, so the two
# don't contend with each other on reboot etc.
#
filestore_device=$(awk -v "dir=${osddir}" '$2 == dir { print $1; }' /etc/mtab)
if [[ ! $filestore_device ]]
then
  error "Can't find device currently mounted on ${osddir}"
fi
if [[ ! $filestore_device =~ ^/dev/sd[a-z]+[0-9]*$ ]]
then
  error "Don't recognize device currently mounted on ${osddir}: ${filestore_device}"
fi
# Drop the trailing partition number to get the whole-disk device
filestore_device=${filestore_device%%+([0-9])}

declare -p unit block_prefix bluestore_device osd osddir fsid osd_json lvnewdb authkey filestore_device
233 | ||
runcmd check-device-size

#
# Create raw LV for block.db
# (${lvnewdb} is "vg/lv": split it into LV name and VG name)
#
runcmd lvcreate --yes -L "${dblvsize}" -n "${lvnewdb#*/}" "${lvnewdb%/*}"

#
# Prepare the new OSD
#
# The list of OSD ids is captured before and after so we can work out
# which id was created. The lists are kept in variables rather than in
# predictably named files in /tmp, and are sorted in the C locale because
# comm(1) needs collation-sorted input: "ceph osd ls" output is in numeric
# order, which stops being sorted in that sense once ids reach 10.
#
osds_before=$(ceph osd ls | LC_ALL=C sort)
runcmd ceph-volume lvm prepare --data "${bluestore_device}" --block.db "${lvnewdb}"
osds_after=$(ceph osd ls | LC_ALL=C sort)

#
# Work out which OSD has been created: the id that is only in the
# "after" list
# Is there a better way of doing this?
#
new=$(LC_ALL=C comm -13 <(printf '%s\n' "${osds_before}") <(printf '%s\n' "${osds_after}"))
[[ $new =~ ^[0-9]+$ ]] || error "New OSD id not found"
257 | ||
#
# Drop the freshly created OSD id from the ceph database again
# (its data directory is left mounted)
#
runcmd ceph osd purge "${new}" --yes-i-really-mean-it

#
# Where the newly created OSD lives: its data dir and the LV (as vg/lv)
# that its "block" symlink points to
#
newdir="/var/lib/ceph/osd/ceph-${new}"
lvnew=$(readlink "${newdir}/block")
lvnew=${lvnew#/dev/}

#
# lvfix is the name the LV is going to be renamed to, so that it ends in
# the (original) fsid. First make sure the current name has the expected
# <vg>/<prefix>-<uuid> shape.
#
if ! is_uuid "${lvnew#*/${block_prefix}-}"
then
  error "LV not recognised: ${lvnew}"
fi
lvfix="${lvnew%%/*}/${block_prefix}-${fsid}"

declare -p new newdir lvnew lvfix
278 | ||
#
# The "dup" step only works if the destination has the same id and fsid
# as the source, so rewrite them everywhere they are recorded: the LVM
# tags, the bluestore labels and the fsid file.
#
new_fsid=$(< "${newdir}/fsid")

args=()
args+=(--deltag "ceph.osd_id=${new}"            --addtag "ceph.osd_id=${osd}")
args+=(--deltag "ceph.osd_fsid=${new_fsid}"     --addtag "ceph.osd_fsid=${fsid}")
args+=(--deltag "ceph.block_device=${lvnew}"    --addtag "ceph.block_device=${lvfix}")

# Same tag changes on both the block LV and the block.db LV
runcmd lvchange "${args[@]}" "${lvnew}"
runcmd lvchange "${args[@]}" "${lvnewdb}"

runcmd ceph-bluestore-tool set-label-key --dev "${newdir}/block" --key whoami --value "${osd}"
runcmd ceph-bluestore-tool set-label-key --dev "${newdir}/block" --key osd_uuid --value "${fsid}"
runcmd ceph-bluestore-tool set-label-key --dev "${newdir}/block.db" --key osd_uuid --value "${fsid}"

echo "${fsid}" > "${newdir}/fsid"

#
# Give the LV its final name (ending in the original fsid), repoint the
# "block" symlink at it, and refer to it by that name from here on
#
runcmd lvrename "${lvnew}" "${lvfix}"
runcmd ln -sf "/dev/${lvfix}" "${newdir}/block"
lvnew=${lvfix}
309 | ||
#
# Clear both markers saying mkfs has already been done (the label key and
# the file) - otherwise mkfs skips the actual mkfs!
#
runcmd ceph-bluestore-tool rm-label-key --dev "${newdir}/block" --key mkfs_done
runcmd rm "${newdir}/mkfs_done"

#
# Now mkfs, to empty out the new OSD filesystem
#
runcmd ceph-objectstore-tool \
  --type bluestore \
  --data-path "${newdir}" \
  --fsid "${fsid}" \
  --op mkfs \
  --no-mon-config
320 | ||
#
# Stop the osd - the copy can't proceed if it's busy
#
if runcmd systemctl is-active --quiet "ceph-osd@${osd}"
then
  runcmd systemctl stop "ceph-osd@${osd}"
fi

#
# The actual copy...
#
# Timed with the shell's own "time" keyword. Passing "time" through runcmd
# would run the external time(1) program instead, which is not one of the
# commands checked for at startup - and if it were missing the script
# would abort at this point, with the OSD already stopped.
#
time runcmd ceph-objectstore-tool --type filestore --data-path "/var/lib/ceph/osd/ceph-${osd}" --target-data-path "${newdir}" --op dup
331 | ||
#
# Put the auth key back in place after the copy: in the "key" file in
# the new OSD dir and in the osd_key label of the block device, then drop
# the fsid label key.
#
# NOTE(review): these run without runcmd, presumably so the auth key is
# not echoed - confirm.
#
printf '[osd.%d]\n\tkey = %s\n' "${osd}" "${authkey}" \
  > "${newdir}/key"
ceph-bluestore-tool set-label-key \
  --dev "${newdir}/block" --key osd_key --value "${authkey}"
ceph-bluestore-tool rm-label-key \
  --dev "${newdir}/block" --key fsid
338 | ||
#
# Rename the FileStore config file (adding .orig) so it isn't used on boot
#
runcmd mv "${osd_json}" "${osd_json}.orig"

#
# Free up the mount points: unmount the old FileStore dir and the
# temporary dir of the new OSD, and remove the latter
#
runcmd umount "${osddir}"
runcmd umount "${newdir}"
runcmd rmdir "${newdir}"

#
# Bring up the new BlueStore version of the OSD under the original
# id and fsid
#
runcmd ceph-volume lvm trigger "${osd}-${fsid}"
355 | ||
#
# Give things a few seconds to settle; if the OSD's unit is not active by
# then, show its status and give up
#
sleep 5
systemctl is-active --quiet "${unit}" || {
  systemctl status "${unit}"
  exit 1
}

# Operator-driven health check, including a scrub of the OSD
runcmd check-ceph-ok

# Only now make sure the old FileStore can't come back on reboot
runcmd disable-filestore

exit 0