fs/ceph/locks.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/ceph/ceph_debug.h>
   3
   4 #include <linux/file.h>
   5 #include <linux/namei.h>
   6 #include <linux/random.h>
   7
   8 #include "super.h"
   9 #include "mds_client.h"
  10 #include <linux/ceph/pagelist.h>
  11
  12 static u64 lock_secret;
  13 static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
  14                                          struct ceph_mds_request *req);
  15
  16 static inline u64 secure_addr(void *addr)
  17 {
  18         u64 v = lock_secret ^ (u64)(unsigned long)addr;
  19         /*
  20          * Set the most significant bit, so that MDS knows the 'owner'
  21          * is sufficient to identify the owner of lock. (old code uses
  22          * both 'owner' and 'pid')
  23          */
  24         v |= (1ULL << 63);
  25         return v;
  26 }
  27
  28 void __init ceph_flock_init(void)
  29 {
  30         get_random_bytes(&lock_secret, sizeof(lock_secret));
  31 }
  32
  33 static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
  34 {
  35         struct inode *inode = file_inode(src->fl_file);
  36         atomic_inc(&ceph_inode(inode)->i_filelock_ref);
  37 }
  38
  39 static void ceph_fl_release_lock(struct file_lock *fl)
  40 {
  41         struct inode *inode = file_inode(fl->fl_file);
  42         struct ceph_inode_info *ci = ceph_inode(inode);
  43         if (atomic_dec_and_test(&ci->i_filelock_ref)) {
  44                 /* clear error when all locks are released */
  45                 spin_lock(&ci->i_ceph_lock);
  46                 ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
  47                 spin_unlock(&ci->i_ceph_lock);
  48         }
  49 }
  50
  51 static const struct file_lock_operations ceph_fl_lock_ops = {
  52         .fl_copy_lock = ceph_fl_copy_lock,
  53         .fl_release_private = ceph_fl_release_lock,
  54 };
  55
  56 /**
  57  * Implement fcntl and flock locking functions.
  58  */
  59 static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
  60                              int cmd, u8 wait, struct file_lock *fl)
  61 {
  62         struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
  63         struct ceph_mds_request *req;
  64         int err;
  65         u64 length = 0;
  66         u64 owner;
  67
  68         if (operation == CEPH_MDS_OP_SETFILELOCK) {
  69                 /*
  70                  * increasing i_filelock_ref closes race window between
  71                  * handling request reply and adding file_lock struct to
  72                  * inode. Otherwise, auth caps may get trimmed in the
  73                  * window. Caller function will decrease the counter.
  74                  */
  75                 fl->fl_ops = &ceph_fl_lock_ops;
  76                 atomic_inc(&ceph_inode(inode)->i_filelock_ref);
  77         }
  78
  79         if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
  80                 wait = 0;
  81
  82         req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
  83         if (IS_ERR(req))
  84                 return PTR_ERR(req);
  85         req->r_inode = inode;
  86         ihold(inode);
  87         req->r_num_caps = 1;
  88
  89         /* mds requires start and length rather than start and end */
  90         if (LLONG_MAX == fl->fl_end)
  91                 length = 0;
  92         else
  93                 length = fl->fl_end - fl->fl_start + 1;
  94
  95         owner = secure_addr(fl->fl_owner);
  96
  97         dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
  98              "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type,
  99              (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
 100              wait, fl->fl_type);
 101
 102         req->r_args.filelock_change.rule = lock_type;
 103         req->r_args.filelock_change.type = cmd;
 104         req->r_args.filelock_change.owner = cpu_to_le64(owner);
 105         req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
 106         req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
 107         req->r_args.filelock_change.length = cpu_to_le64(length);
 108         req->r_args.filelock_change.wait = wait;
 109
 110         if (wait)
 111                 req->r_wait_for_completion = ceph_lock_wait_for_completion;
 112
 113         err = ceph_mdsc_do_request(mdsc, inode, req);
 114
 115         if (operation == CEPH_MDS_OP_GETFILELOCK) {
 116                 fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
 117                 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
 118                         fl->fl_type = F_RDLCK;
 119                 else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
 120                         fl->fl_type = F_WRLCK;
 121                 else
 122                         fl->fl_type = F_UNLCK;
 123
 124                 fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
 125                 length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
 126                                                  le64_to_cpu(req->r_reply_info.filelock_reply->length);
 127                 if (length >= 1)
 128                         fl->fl_end = length -1;
 129                 else
 130                         fl->fl_end = 0;
 131
 132         }
 133         ceph_mdsc_put_request(req);
 134         dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
 135              "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type,
 136              (int)operation, (u64)fl->fl_pid, fl->fl_start,
 137              length, wait, fl->fl_type, err);
 138         return err;
 139 }
 140
 141 static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
 142                                          struct ceph_mds_request *req)
 143 {
 144         struct ceph_mds_request *intr_req;
 145         struct inode *inode = req->r_inode;
 146         int err, lock_type;
 147
 148         BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
 149         if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
 150                 lock_type = CEPH_LOCK_FCNTL_INTR;
 151         else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
 152                 lock_type = CEPH_LOCK_FLOCK_INTR;
 153         else
 154                 BUG_ON(1);
 155         BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);
 156
 157         err = wait_for_completion_interruptible(&req->r_completion);
 158         if (!err)
 159                 return 0;
 160
 161         dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
 162              req->r_tid);
 163
 164         mutex_lock(&mdsc->mutex);
 165         if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
 166                 err = 0;
 167         } else {
 168                 /*
 169                  * ensure we aren't running concurrently with
 170                  * ceph_fill_trace or ceph_readdir_prepopulate, which
 171                  * rely on locks (dir mutex) held by our caller.
 172                  */
 173                 mutex_lock(&req->r_fill_mutex);
 174                 req->r_err = err;
 175                 set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
 176                 mutex_unlock(&req->r_fill_mutex);
 177
 178                 if (!req->r_session) {
 179                         // haven't sent the request
 180                         err = 0;
 181                 }
 182         }
 183         mutex_unlock(&mdsc->mutex);
 184         if (!err)
 185                 return 0;
 186
 187         intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
 188                                             USE_AUTH_MDS);
 189         if (IS_ERR(intr_req))
 190                 return PTR_ERR(intr_req);
 191
 192         intr_req->r_inode = inode;
 193         ihold(inode);
 194         intr_req->r_num_caps = 1;
 195
 196         intr_req->r_args.filelock_change = req->r_args.filelock_change;
 197         intr_req->r_args.filelock_change.rule = lock_type;
 198         intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;
 199
 200         err = ceph_mdsc_do_request(mdsc, inode, intr_req);
 201         ceph_mdsc_put_request(intr_req);
 202
 203         if (err && err != -ERESTARTSYS)
 204                 return err;
 205
 206         wait_for_completion_killable(&req->r_safe_completion);
 207         return 0;
 208 }
 209
 210 /**
 211  * Attempt to set an fcntl lock.
 212  * For now, this just goes away to the server. Later it may be more awesome.
 213  */
 214 int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 215 {
 216         struct inode *inode = file_inode(file);
 217         struct ceph_inode_info *ci = ceph_inode(inode);
 218         int err = 0;
 219         u16 op = CEPH_MDS_OP_SETFILELOCK;
 220         u8 wait = 0;
 221         u8 lock_cmd;
 222
 223         if (!(fl->fl_flags & FL_POSIX))
 224                 return -ENOLCK;
 225         /* No mandatory locks */
 226         if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
 227                 return -ENOLCK;
 228
 229         dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
 230
 231         /* set wait bit as appropriate, then make command as Ceph expects it*/
 232         if (IS_GETLK(cmd))
 233                 op = CEPH_MDS_OP_GETFILELOCK;
 234         else if (IS_SETLKW(cmd))
 235                 wait = 1;
 236
 237         spin_lock(&ci->i_ceph_lock);
 238         if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
 239                 err = -EIO;
 240         }
 241         spin_unlock(&ci->i_ceph_lock);
 242         if (err < 0) {
 243                 if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type)
 244                         posix_lock_file(file, fl, NULL);
 245                 return err;
 246         }
 247
 248         if (F_RDLCK == fl->fl_type)
 249                 lock_cmd = CEPH_LOCK_SHARED;
 250         else if (F_WRLCK == fl->fl_type)
 251                 lock_cmd = CEPH_LOCK_EXCL;
 252         else
 253                 lock_cmd = CEPH_LOCK_UNLOCK;
 254
 255         err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
 256         if (!err) {
 257                 if (op == CEPH_MDS_OP_SETFILELOCK) {
 258                         dout("mds locked, locking locally\n");
 259                         err = posix_lock_file(file, fl, NULL);
 260                         if (err) {
 261                                 /* undo! This should only happen if
 262                                  * the kernel detects local
 263                                  * deadlock. */
 264                                 ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
 265                                                   CEPH_LOCK_UNLOCK, 0, fl);
 266                                 dout("got %d on posix_lock_file, undid lock\n",
 267                                      err);
 268                         }
 269                 }
 270         }
 271         return err;
 272 }
 273
 274 int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 275 {
 276         struct inode *inode = file_inode(file);
 277         struct ceph_inode_info *ci = ceph_inode(inode);
 278         int err = 0;
 279         u8 wait = 0;
 280         u8 lock_cmd;
 281
 282         if (!(fl->fl_flags & FL_FLOCK))
 283                 return -ENOLCK;
 284         /* No mandatory locks */
 285         if (fl->fl_type & LOCK_MAND)
 286                 return -EOPNOTSUPP;
 287
 288         dout("ceph_flock, fl_file: %p\n", fl->fl_file);
 289
 290         spin_lock(&ci->i_ceph_lock);
 291         if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
 292                 err = -EIO;
 293         }
 294         spin_unlock(&ci->i_ceph_lock);
 295         if (err < 0) {
 296                 if (F_UNLCK == fl->fl_type)
 297                         locks_lock_file_wait(file, fl);
 298                 return err;
 299         }
 300
 301         if (IS_SETLKW(cmd))
 302                 wait = 1;
 303
 304         if (F_RDLCK == fl->fl_type)
 305                 lock_cmd = CEPH_LOCK_SHARED;
 306         else if (F_WRLCK == fl->fl_type)
 307                 lock_cmd = CEPH_LOCK_EXCL;
 308         else
 309                 lock_cmd = CEPH_LOCK_UNLOCK;
 310
 311         err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
 312                                 inode, lock_cmd, wait, fl);
 313         if (!err) {
 314                 err = locks_lock_file_wait(file, fl);
 315                 if (err) {
 316                         ceph_lock_message(CEPH_LOCK_FLOCK,
 317                                           CEPH_MDS_OP_SETFILELOCK,
 318                                           inode, CEPH_LOCK_UNLOCK, 0, fl);
 319                         dout("got %d on locks_lock_file_wait, undid lock\n", err);
 320                 }
 321         }
 322         return err;
 323 }
 324
 325 /*
 326  * Fills in the passed counter variables, so you can prepare pagelist metadata
 327  * before calling ceph_encode_locks.
 328  */
 329 void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
 330 {
 331         struct file_lock *lock;
 332         struct file_lock_context *ctx;
 333
 334         *fcntl_count = 0;
 335         *flock_count = 0;
 336
 337         ctx = inode->i_flctx;
 338         if (ctx) {
 339                 spin_lock(&ctx->flc_lock);
 340                 list_for_each_entry(lock, &ctx->flc_posix, fl_list)
 341                         ++(*fcntl_count);
 342                 list_for_each_entry(lock, &ctx->flc_flock, fl_list)
 343                         ++(*flock_count);
 344                 spin_unlock(&ctx->flc_lock);
 345         }
 346         dout("counted %d flock locks and %d fcntl locks\n",
 347              *flock_count, *fcntl_count);
 348 }
 349
 350 /*
 351  * Given a pointer to a lock, convert it to a ceph filelock
 352  */
 353 static int lock_to_ceph_filelock(struct file_lock *lock,
 354                                  struct ceph_filelock *cephlock)
 355 {
 356         int err = 0;
 357         cephlock->start = cpu_to_le64(lock->fl_start);
 358         cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
 359         cephlock->client = cpu_to_le64(0);
 360         cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
 361         cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
 362
 363         switch (lock->fl_type) {
 364         case F_RDLCK:
 365                 cephlock->type = CEPH_LOCK_SHARED;
 366                 break;
 367         case F_WRLCK:
 368                 cephlock->type = CEPH_LOCK_EXCL;
 369                 break;
 370         case F_UNLCK:
 371                 cephlock->type = CEPH_LOCK_UNLOCK;
 372                 break;
 373         default:
 374                 dout("Have unknown lock type %d\n", lock->fl_type);
 375                 err = -EINVAL;
 376         }
 377
 378         return err;
 379 }
 380
 381 /**
 382  * Encode the flock and fcntl locks for the given inode into the ceph_filelock
 383  * array. Must be called with inode->i_lock already held.
 384  * If we encounter more of a specific lock type than expected, return -ENOSPC.
 385  */
 386 int ceph_encode_locks_to_buffer(struct inode *inode,
 387                                 struct ceph_filelock *flocks,
 388                                 int num_fcntl_locks, int num_flock_locks)
 389 {
 390         struct file_lock *lock;
 391         struct file_lock_context *ctx = inode->i_flctx;
 392         int err = 0;
 393         int seen_fcntl = 0;
 394         int seen_flock = 0;
 395         int l = 0;
 396
 397         dout("encoding %d flock and %d fcntl locks\n", num_flock_locks,
 398              num_fcntl_locks);
 399
 400         if (!ctx)
 401                 return 0;
 402
 403         spin_lock(&ctx->flc_lock);
 404         list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
 405                 ++seen_fcntl;
 406                 if (seen_fcntl > num_fcntl_locks) {
 407                         err = -ENOSPC;
 408                         goto fail;
 409                 }
 410                 err = lock_to_ceph_filelock(lock, &flocks[l]);
 411                 if (err)
 412                         goto fail;
 413                 ++l;
 414         }
 415         list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
 416                 ++seen_flock;
 417                 if (seen_flock > num_flock_locks) {
 418                         err = -ENOSPC;
 419                         goto fail;
 420                 }
 421                 err = lock_to_ceph_filelock(lock, &flocks[l]);
 422                 if (err)
 423                         goto fail;
 424                 ++l;
 425         }
 426 fail:
 427         spin_unlock(&ctx->flc_lock);
 428         return err;
 429 }
 430
 431 /**
 432  * Copy the encoded flock and fcntl locks into the pagelist.
 433  * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
 434  * sequential flock locks.
 435  * Returns zero on success.
 436  */
 437 int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
 438                            struct ceph_pagelist *pagelist,
 439                            int num_fcntl_locks, int num_flock_locks)
 440 {
 441         int err = 0;
 442         __le32 nlocks;
 443
 444         nlocks = cpu_to_le32(num_fcntl_locks);
 445         err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
 446         if (err)
 447                 goto out_fail;
 448
 449         if (num_fcntl_locks > 0) {
 450                 err = ceph_pagelist_append(pagelist, flocks,
 451                                            num_fcntl_locks * sizeof(*flocks));
 452                 if (err)
 453                         goto out_fail;
 454         }
 455
 456         nlocks = cpu_to_le32(num_flock_locks);
 457         err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
 458         if (err)
 459                 goto out_fail;
 460
 461         if (num_flock_locks > 0) {
 462                 err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
 463                                            num_flock_locks * sizeof(*flocks));
 464         }
 465 out_fail:
 466         return err;
 467 }