1 // SPDX-License-Identifier: GPL-2.0
5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 only,
9 * as published by the Free Software Foundation.
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License version 2 for more details (a copy is included
15 * in the LICENSE file that accompanied this code).
17 * You should have received a copy of the GNU General Public License
18 * version 2 along with this program; If not, see
19 * http://www.gnu.org/licenses/gpl-2.0.html
24 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Use is subject to license terms.
27 * Copyright (c) 2011, 2015, Intel Corporation.
30 * This file is part of Lustre, http://www.lustre.org/
31 * Lustre is a trademark of Sun Microsystems, Inc.
35 * Author: Peter Braam <braam@clusterfs.com>
36 * Author: Phil Schwan <phil@clusterfs.com>
37 * Author: Andreas Dilger <adilger@clusterfs.com>
40 #define DEBUG_SUBSYSTEM S_LLITE
41 #include <lustre_dlm.h>
42 #include <linux/pagemap.h>
43 #include <linux/file.h>
44 #include <linux/sched.h>
45 #include <linux/mount.h>
46 #include <uapi/linux/lustre/lustre_fiemap.h>
47 #include <uapi/linux/lustre/lustre_ioctl.h>
48 #include <lustre_swab.h>
50 #include <cl_object.h>
51 #include "llite_internal.h"
54 ll_put_grouplock(struct inode
*inode
, struct file
*file
, unsigned long arg
);
56 static int ll_lease_close(struct obd_client_handle
*och
, struct inode
*inode
,
59 static enum llioc_iter
60 ll_iocontrol_call(struct inode
*inode
, struct file
*file
,
61 unsigned int cmd
, unsigned long arg
, int *rcp
);
63 static struct ll_file_data
*ll_file_data_get(void)
65 struct ll_file_data
*fd
;
67 fd
= kmem_cache_zalloc(ll_file_data_slab
, GFP_NOFS
);
70 fd
->fd_write_failed
= false;
74 static void ll_file_data_put(struct ll_file_data
*fd
)
77 kmem_cache_free(ll_file_data_slab
, fd
);
81 * Packs all the attributes into @op_data for the CLOSE rpc.
83 static void ll_prepare_close(struct inode
*inode
, struct md_op_data
*op_data
,
84 struct obd_client_handle
*och
)
86 struct ll_inode_info
*lli
= ll_i2info(inode
);
88 ll_prep_md_op_data(op_data
, inode
, NULL
, NULL
,
89 0, 0, LUSTRE_OPC_ANY
, NULL
);
91 op_data
->op_attr
.ia_mode
= inode
->i_mode
;
92 op_data
->op_attr
.ia_atime
= inode
->i_atime
;
93 op_data
->op_attr
.ia_mtime
= inode
->i_mtime
;
94 op_data
->op_attr
.ia_ctime
= inode
->i_ctime
;
95 op_data
->op_attr
.ia_size
= i_size_read(inode
);
96 op_data
->op_attr
.ia_valid
|= ATTR_MODE
| ATTR_ATIME
| ATTR_ATIME_SET
|
97 ATTR_MTIME
| ATTR_MTIME_SET
|
98 ATTR_CTIME
| ATTR_CTIME_SET
;
99 op_data
->op_attr_blocks
= inode
->i_blocks
;
100 op_data
->op_attr_flags
= ll_inode_to_ext_flags(inode
->i_flags
);
101 op_data
->op_handle
= och
->och_fh
;
104 * For HSM: if inode data has been modified, pack it so that
105 * MDT can set data dirty flag in the archive.
107 if (och
->och_flags
& FMODE_WRITE
&&
108 test_and_clear_bit(LLIF_DATA_MODIFIED
, &lli
->lli_flags
))
109 op_data
->op_bias
|= MDS_DATA_MODIFIED
;
113 * Perform a close, possibly with a bias.
114 * The meaning of "data" depends on the value of "bias".
116 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
117 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
120 static int ll_close_inode_openhandle(struct inode
*inode
,
121 struct obd_client_handle
*och
,
122 enum mds_op_bias bias
,
125 const struct ll_inode_info
*lli
= ll_i2info(inode
);
126 struct obd_export
*md_exp
= ll_i2mdexp(inode
);
127 struct md_op_data
*op_data
;
128 struct ptlrpc_request
*req
= NULL
;
131 if (!class_exp2obd(md_exp
)) {
132 CERROR("%s: invalid MDC connection handle closing " DFID
"\n",
133 ll_get_fsname(inode
->i_sb
, NULL
, 0),
134 PFID(&lli
->lli_fid
));
139 op_data
= kzalloc(sizeof(*op_data
), GFP_NOFS
);
141 * We leak openhandle and request here on error, but not much to be
142 * done in OOM case since app won't retry close on error either.
149 ll_prepare_close(inode
, op_data
, och
);
151 case MDS_CLOSE_LAYOUT_SWAP
:
153 op_data
->op_bias
|= MDS_CLOSE_LAYOUT_SWAP
;
154 op_data
->op_data_version
= 0;
155 op_data
->op_lease_handle
= och
->och_lease_handle
;
156 op_data
->op_fid2
= *ll_inode2fid(data
);
159 case MDS_HSM_RELEASE
:
161 op_data
->op_bias
|= MDS_HSM_RELEASE
;
162 op_data
->op_data_version
= *(__u64
*)data
;
163 op_data
->op_lease_handle
= och
->och_lease_handle
;
164 op_data
->op_attr
.ia_valid
|= ATTR_SIZE
| ATTR_BLOCKS
;
172 rc
= md_close(md_exp
, op_data
, och
->och_mod
, &req
);
173 if (rc
&& rc
!= -EINTR
) {
174 CERROR("%s: inode " DFID
" mdc close failed: rc = %d\n",
175 md_exp
->exp_obd
->obd_name
, PFID(&lli
->lli_fid
), rc
);
178 if (op_data
->op_bias
& (MDS_HSM_RELEASE
| MDS_CLOSE_LAYOUT_SWAP
) &&
180 struct mdt_body
*body
;
182 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
183 if (!(body
->mbo_valid
& OBD_MD_CLOSE_INTENT_EXECED
))
187 ll_finish_md_op_data(op_data
);
190 md_clear_open_replay_data(md_exp
, och
);
191 och
->och_fh
.cookie
= DEAD_HANDLE_MAGIC
;
194 ptlrpc_req_finished(req
);
198 int ll_md_real_close(struct inode
*inode
, fmode_t fmode
)
200 struct ll_inode_info
*lli
= ll_i2info(inode
);
201 struct obd_client_handle
**och_p
;
202 struct obd_client_handle
*och
;
206 if (fmode
& FMODE_WRITE
) {
207 och_p
= &lli
->lli_mds_write_och
;
208 och_usecount
= &lli
->lli_open_fd_write_count
;
209 } else if (fmode
& FMODE_EXEC
) {
210 och_p
= &lli
->lli_mds_exec_och
;
211 och_usecount
= &lli
->lli_open_fd_exec_count
;
213 LASSERT(fmode
& FMODE_READ
);
214 och_p
= &lli
->lli_mds_read_och
;
215 och_usecount
= &lli
->lli_open_fd_read_count
;
218 mutex_lock(&lli
->lli_och_mutex
);
219 if (*och_usecount
> 0) {
220 /* There are still users of this handle, so skip
223 mutex_unlock(&lli
->lli_och_mutex
);
229 mutex_unlock(&lli
->lli_och_mutex
);
232 /* There might be a race and this handle may already
235 rc
= ll_close_inode_openhandle(inode
, och
, 0, NULL
);
241 static int ll_md_close(struct inode
*inode
, struct file
*file
)
243 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
244 struct ll_inode_info
*lli
= ll_i2info(inode
);
246 __u64 flags
= LDLM_FL_BLOCK_GRANTED
| LDLM_FL_TEST_LOCK
;
247 struct lustre_handle lockh
;
248 union ldlm_policy_data policy
= {
249 .l_inodebits
= { MDS_INODELOCK_OPEN
}
253 /* clear group lock, if present */
254 if (unlikely(fd
->fd_flags
& LL_FILE_GROUP_LOCKED
))
255 ll_put_grouplock(inode
, file
, fd
->fd_grouplock
.lg_gid
);
257 if (fd
->fd_lease_och
) {
260 /* Usually the lease is not released when the
261 * application crashed, we need to release here.
263 rc
= ll_lease_close(fd
->fd_lease_och
, inode
, &lease_broken
);
264 CDEBUG(rc
? D_ERROR
: D_INODE
,
265 "Clean up lease " DFID
" %d/%d\n",
266 PFID(&lli
->lli_fid
), rc
, lease_broken
);
268 fd
->fd_lease_och
= NULL
;
272 rc
= ll_close_inode_openhandle(inode
, fd
->fd_och
, 0, NULL
);
277 /* Let's see if we have good enough OPEN lock on the file and if
278 * we can skip talking to MDS
281 mutex_lock(&lli
->lli_och_mutex
);
282 if (fd
->fd_omode
& FMODE_WRITE
) {
284 LASSERT(lli
->lli_open_fd_write_count
);
285 lli
->lli_open_fd_write_count
--;
286 } else if (fd
->fd_omode
& FMODE_EXEC
) {
288 LASSERT(lli
->lli_open_fd_exec_count
);
289 lli
->lli_open_fd_exec_count
--;
292 LASSERT(lli
->lli_open_fd_read_count
);
293 lli
->lli_open_fd_read_count
--;
295 mutex_unlock(&lli
->lli_och_mutex
);
297 if (!md_lock_match(ll_i2mdexp(inode
), flags
, ll_inode2fid(inode
),
298 LDLM_IBITS
, &policy
, lockmode
, &lockh
))
299 rc
= ll_md_real_close(inode
, fd
->fd_omode
);
302 LUSTRE_FPRIVATE(file
) = NULL
;
303 ll_file_data_put(fd
);
308 /* While this returns an error code, fput() the caller does not, so we need
309 * to make every effort to clean up all of our state here. Also, applications
310 * rarely check close errors and even if an error is returned they will not
311 * re-try the close call.
313 int ll_file_release(struct inode
*inode
, struct file
*file
)
315 struct ll_file_data
*fd
;
316 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
317 struct ll_inode_info
*lli
= ll_i2info(inode
);
320 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p)\n",
321 PFID(ll_inode2fid(inode
)), inode
);
323 if (!is_root_inode(inode
))
324 ll_stats_ops_tally(sbi
, LPROC_LL_RELEASE
, 1);
325 fd
= LUSTRE_FPRIVATE(file
);
328 /* The last ref on @file, maybe not be the owner pid of statahead,
329 * because parent and child process can share the same file handle.
331 if (S_ISDIR(inode
->i_mode
) && lli
->lli_opendir_key
== fd
)
332 ll_deauthorize_statahead(inode
, fd
);
334 if (is_root_inode(inode
)) {
335 LUSTRE_FPRIVATE(file
) = NULL
;
336 ll_file_data_put(fd
);
340 if (!S_ISDIR(inode
->i_mode
)) {
342 lov_read_and_clear_async_rc(lli
->lli_clob
);
343 lli
->lli_async_rc
= 0;
346 rc
= ll_md_close(inode
, file
);
348 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG
, cfs_fail_val
))
349 libcfs_debug_dumplog();
354 static int ll_intent_file_open(struct dentry
*de
, void *lmm
, int lmmsize
,
355 struct lookup_intent
*itp
)
357 struct inode
*inode
= d_inode(de
);
358 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
359 struct dentry
*parent
= de
->d_parent
;
360 const char *name
= NULL
;
361 struct md_op_data
*op_data
;
362 struct ptlrpc_request
*req
= NULL
;
366 LASSERT(itp
->it_flags
& MDS_OPEN_BY_FID
);
369 * if server supports open-by-fid, or file name is invalid, don't pack
370 * name in open request
372 if (!(exp_connect_flags(sbi
->ll_md_exp
) & OBD_CONNECT_OPEN_BY_FID
) &&
373 lu_name_is_valid_2(de
->d_name
.name
, de
->d_name
.len
)) {
374 name
= de
->d_name
.name
;
375 len
= de
->d_name
.len
;
378 op_data
= ll_prep_md_op_data(NULL
, d_inode(parent
), inode
, name
, len
,
379 O_RDWR
, LUSTRE_OPC_ANY
, NULL
);
381 return PTR_ERR(op_data
);
382 op_data
->op_data
= lmm
;
383 op_data
->op_data_size
= lmmsize
;
385 rc
= md_intent_lock(sbi
->ll_md_exp
, op_data
, itp
, &req
,
386 &ll_md_blocking_ast
, 0);
387 ll_finish_md_op_data(op_data
);
389 /* reason for keep own exit path - don`t flood log
390 * with messages with -ESTALE errors.
392 if (!it_disposition(itp
, DISP_OPEN_OPEN
) ||
393 it_open_error(DISP_OPEN_OPEN
, itp
))
395 ll_release_openhandle(inode
, itp
);
399 if (it_disposition(itp
, DISP_LOOKUP_NEG
)) {
404 if (rc
!= 0 || it_open_error(DISP_OPEN_OPEN
, itp
)) {
405 rc
= rc
? rc
: it_open_error(DISP_OPEN_OPEN
, itp
);
406 CDEBUG(D_VFSTRACE
, "lock enqueue: err: %d\n", rc
);
410 rc
= ll_prep_inode(&inode
, req
, NULL
, itp
);
411 if (!rc
&& itp
->it_lock_mode
)
412 ll_set_lock_data(sbi
->ll_md_exp
, inode
, itp
, NULL
);
415 ptlrpc_req_finished(req
);
416 ll_intent_drop_lock(itp
);
419 * We did open by fid, but by the time we got to the server,
420 * the object disappeared. If this is a create, we cannot really
421 * tell the userspace that the file it was trying to create
422 * does not exist. Instead let's return -ESTALE, and the VFS will
423 * retry the create with LOOKUP_REVAL that we are going to catch
424 * in ll_revalidate_dentry() and use lookup then.
426 if (rc
== -ENOENT
&& itp
->it_op
& IT_CREAT
)
432 static int ll_och_fill(struct obd_export
*md_exp
, struct lookup_intent
*it
,
433 struct obd_client_handle
*och
)
435 struct mdt_body
*body
;
437 body
= req_capsule_server_get(&it
->it_request
->rq_pill
, &RMF_MDT_BODY
);
438 och
->och_fh
= body
->mbo_handle
;
439 och
->och_fid
= body
->mbo_fid1
;
440 och
->och_lease_handle
.cookie
= it
->it_lock_handle
;
441 och
->och_magic
= OBD_CLIENT_HANDLE_MAGIC
;
442 och
->och_flags
= it
->it_flags
;
444 return md_set_open_replay_data(md_exp
, och
, it
);
447 static int ll_local_open(struct file
*file
, struct lookup_intent
*it
,
448 struct ll_file_data
*fd
, struct obd_client_handle
*och
)
450 struct inode
*inode
= file_inode(file
);
452 LASSERT(!LUSTRE_FPRIVATE(file
));
459 rc
= ll_och_fill(ll_i2sbi(inode
)->ll_md_exp
, it
, och
);
464 LUSTRE_FPRIVATE(file
) = fd
;
465 ll_readahead_init(inode
, &fd
->fd_ras
);
466 fd
->fd_omode
= it
->it_flags
& (FMODE_READ
| FMODE_WRITE
| FMODE_EXEC
);
468 /* ll_cl_context initialize */
469 rwlock_init(&fd
->fd_lock
);
470 INIT_LIST_HEAD(&fd
->fd_lccs
);
475 /* Open a file, and (for the very first open) create objects on the OSTs at
476 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
477 * creation or open until ll_lov_setstripe() ioctl is called.
479 * If we already have the stripe MD locally then we don't request it in
480 * md_open(), by passing a lmm_size = 0.
482 * It is up to the application to ensure no other processes open this file
483 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
484 * used. We might be able to avoid races of that sort by getting lli_open_sem
485 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
486 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
488 int ll_file_open(struct inode
*inode
, struct file
*file
)
490 struct ll_inode_info
*lli
= ll_i2info(inode
);
491 struct lookup_intent
*it
, oit
= { .it_op
= IT_OPEN
,
492 .it_flags
= file
->f_flags
};
493 struct obd_client_handle
**och_p
= NULL
;
494 __u64
*och_usecount
= NULL
;
495 struct ll_file_data
*fd
;
498 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p), flags %o\n",
499 PFID(ll_inode2fid(inode
)), inode
, file
->f_flags
);
501 it
= file
->private_data
; /* XXX: compat macro */
502 file
->private_data
= NULL
; /* prevent ll_local_open assertion */
504 fd
= ll_file_data_get();
511 if (S_ISDIR(inode
->i_mode
))
512 ll_authorize_statahead(inode
, fd
);
514 if (is_root_inode(inode
)) {
515 LUSTRE_FPRIVATE(file
) = fd
;
519 if (!it
|| !it
->it_disposition
) {
520 /* Convert f_flags into access mode. We cannot use file->f_mode,
521 * because everything but O_ACCMODE mask was stripped from
524 if ((oit
.it_flags
+ 1) & O_ACCMODE
)
526 if (file
->f_flags
& O_TRUNC
)
527 oit
.it_flags
|= FMODE_WRITE
;
529 /* kernel only call f_op->open in dentry_open. filp_open calls
530 * dentry_open after call to open_namei that checks permissions.
531 * Only nfsd_open call dentry_open directly without checking
532 * permissions and because of that this code below is safe.
534 if (oit
.it_flags
& (FMODE_WRITE
| FMODE_READ
))
535 oit
.it_flags
|= MDS_OPEN_OWNEROVERRIDE
;
537 /* We do not want O_EXCL here, presumably we opened the file
538 * already? XXX - NFS implications?
540 oit
.it_flags
&= ~O_EXCL
;
542 /* bug20584, if "it_flags" contains O_CREAT, the file will be
543 * created if necessary, then "IT_CREAT" should be set to keep
546 if (oit
.it_flags
& O_CREAT
)
547 oit
.it_op
|= IT_CREAT
;
553 /* Let's see if we have file open on MDS already. */
554 if (it
->it_flags
& FMODE_WRITE
) {
555 och_p
= &lli
->lli_mds_write_och
;
556 och_usecount
= &lli
->lli_open_fd_write_count
;
557 } else if (it
->it_flags
& FMODE_EXEC
) {
558 och_p
= &lli
->lli_mds_exec_och
;
559 och_usecount
= &lli
->lli_open_fd_exec_count
;
561 och_p
= &lli
->lli_mds_read_och
;
562 och_usecount
= &lli
->lli_open_fd_read_count
;
565 mutex_lock(&lli
->lli_och_mutex
);
566 if (*och_p
) { /* Open handle is present */
567 if (it_disposition(it
, DISP_OPEN_OPEN
)) {
568 /* Well, there's extra open request that we do not need,
569 * let's close it somehow. This will decref request.
571 rc
= it_open_error(DISP_OPEN_OPEN
, it
);
573 mutex_unlock(&lli
->lli_och_mutex
);
577 ll_release_openhandle(inode
, it
);
581 rc
= ll_local_open(file
, it
, fd
, NULL
);
584 mutex_unlock(&lli
->lli_och_mutex
);
588 LASSERT(*och_usecount
== 0);
589 if (!it
->it_disposition
) {
590 /* We cannot just request lock handle now, new ELC code
591 * means that one of other OPEN locks for this file
592 * could be cancelled, and since blocking ast handler
593 * would attempt to grab och_mutex as well, that would
594 * result in a deadlock
596 mutex_unlock(&lli
->lli_och_mutex
);
598 * Normally called under two situations:
600 * 2. revalidate with IT_OPEN (revalidate doesn't
601 * execute this intent any more).
603 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
605 * Always specify MDS_OPEN_BY_FID because we don't want
606 * to get file with different fid.
608 it
->it_flags
|= MDS_OPEN_LOCK
| MDS_OPEN_BY_FID
;
609 rc
= ll_intent_file_open(file
->f_path
.dentry
,
616 *och_p
= kzalloc(sizeof(struct obd_client_handle
), GFP_NOFS
);
624 /* md_intent_lock() didn't get a request ref if there was an
625 * open error, so don't do cleanup on the request here
628 /* XXX (green): Should not we bail out on any error here, not
631 rc
= it_open_error(DISP_OPEN_OPEN
, it
);
635 LASSERTF(it_disposition(it
, DISP_ENQ_OPEN_REF
),
636 "inode %p: disposition %x, status %d\n", inode
,
637 it_disposition(it
, ~0), it
->it_status
);
639 rc
= ll_local_open(file
, it
, fd
, *och_p
);
643 mutex_unlock(&lli
->lli_och_mutex
);
646 /* Must do this outside lli_och_mutex lock to prevent deadlock where
647 * different kind of OPEN lock for this same inode gets cancelled
650 if (!S_ISREG(inode
->i_mode
))
653 cl_lov_delay_create_clear(&file
->f_flags
);
658 if (och_p
&& *och_p
) {
663 mutex_unlock(&lli
->lli_och_mutex
);
666 if (lli
->lli_opendir_key
== fd
)
667 ll_deauthorize_statahead(inode
, fd
);
669 ll_file_data_put(fd
);
671 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_OPEN
, 1);
674 if (it
&& it_disposition(it
, DISP_ENQ_OPEN_REF
)) {
675 ptlrpc_req_finished(it
->it_request
);
676 it_clear_disposition(it
, DISP_ENQ_OPEN_REF
);
/*
 * Lock callback that ll_lease_open() hands to md_intent_lock() when it
 * requests a lease.
 *
 * For LDLM_CB_BLOCKING the lock is turned into a handle and cancelled
 * asynchronously (LCF_ASYNC); the cancel result is logged at D_INODE.
 *
 * NOTE(review): the switch header, the body of the LDLM_CB_CANCELING case
 * and the return statement are not visible in this excerpt - confirm
 * against the full source before relying on this description.
 */
682 static int ll_md_blocking_lease_ast(struct ldlm_lock
*lock
,
683 struct ldlm_lock_desc
*desc
,
684 void *data
, int flag
)
687 struct lustre_handle lockh
;
690 case LDLM_CB_BLOCKING
:
/* Cancel by handle rather than by lock pointer. */
691 ldlm_lock2handle(lock
, &lockh
);
692 rc
= ldlm_cli_cancel(&lockh
, LCF_ASYNC
);
694 CDEBUG(D_INODE
, "ldlm_cli_cancel: %d\n", rc
);
698 case LDLM_CB_CANCELING
:
706 * Acquire a lease and open the file.
708 static struct obd_client_handle
*
709 ll_lease_open(struct inode
*inode
, struct file
*file
, fmode_t fmode
,
712 struct lookup_intent it
= { .it_op
= IT_OPEN
};
713 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
714 struct md_op_data
*op_data
;
715 struct ptlrpc_request
*req
= NULL
;
716 struct lustre_handle old_handle
= { 0 };
717 struct obd_client_handle
*och
= NULL
;
721 if (fmode
!= FMODE_WRITE
&& fmode
!= FMODE_READ
)
722 return ERR_PTR(-EINVAL
);
725 struct ll_inode_info
*lli
= ll_i2info(inode
);
726 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
727 struct obd_client_handle
**och_p
;
730 if (!(fmode
& file
->f_mode
) || (file
->f_mode
& FMODE_EXEC
))
731 return ERR_PTR(-EPERM
);
733 /* Get the openhandle of the file */
735 mutex_lock(&lli
->lli_och_mutex
);
736 if (fd
->fd_lease_och
) {
737 mutex_unlock(&lli
->lli_och_mutex
);
742 if (file
->f_mode
& FMODE_WRITE
) {
743 LASSERT(lli
->lli_mds_write_och
);
744 och_p
= &lli
->lli_mds_write_och
;
745 och_usecount
= &lli
->lli_open_fd_write_count
;
747 LASSERT(lli
->lli_mds_read_och
);
748 och_p
= &lli
->lli_mds_read_och
;
749 och_usecount
= &lli
->lli_open_fd_read_count
;
751 if (*och_usecount
== 1) {
758 mutex_unlock(&lli
->lli_och_mutex
);
759 if (rc
< 0) /* more than 1 opener */
763 old_handle
= fd
->fd_och
->och_fh
;
766 och
= kzalloc(sizeof(*och
), GFP_NOFS
);
768 return ERR_PTR(-ENOMEM
);
770 op_data
= ll_prep_md_op_data(NULL
, inode
, inode
, NULL
, 0, 0,
771 LUSTRE_OPC_ANY
, NULL
);
772 if (IS_ERR(op_data
)) {
773 rc
= PTR_ERR(op_data
);
777 /* To tell the MDT this openhandle is from the same owner */
778 op_data
->op_handle
= old_handle
;
780 it
.it_flags
= fmode
| open_flags
;
781 it
.it_flags
|= MDS_OPEN_LOCK
| MDS_OPEN_BY_FID
| MDS_OPEN_LEASE
;
782 rc
= md_intent_lock(sbi
->ll_md_exp
, op_data
, &it
, &req
,
783 &ll_md_blocking_lease_ast
,
784 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
785 * it can be cancelled which may mislead applications that the lease is
787 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
788 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
789 * doesn't deal with openhandle, so normal openhandle will be leaked.
791 LDLM_FL_NO_LRU
| LDLM_FL_EXCL
);
792 ll_finish_md_op_data(op_data
);
793 ptlrpc_req_finished(req
);
797 if (it_disposition(&it
, DISP_LOOKUP_NEG
)) {
802 rc
= it_open_error(DISP_OPEN_OPEN
, &it
);
806 LASSERT(it_disposition(&it
, DISP_ENQ_OPEN_REF
));
807 ll_och_fill(sbi
->ll_md_exp
, &it
, och
);
809 if (!it_disposition(&it
, DISP_OPEN_LEASE
)) /* old server? */ {
814 /* already get lease, handle lease lock */
815 ll_set_lock_data(sbi
->ll_md_exp
, inode
, &it
, NULL
);
816 if (it
.it_lock_mode
== 0 ||
817 it
.it_lock_bits
!= MDS_INODELOCK_OPEN
) {
818 /* open lock must return for lease */
819 CERROR(DFID
"lease granted but no open lock, %d/%llu.\n",
820 PFID(ll_inode2fid(inode
)), it
.it_lock_mode
,
826 ll_intent_release(&it
);
830 /* Cancel open lock */
831 if (it
.it_lock_mode
!= 0) {
832 ldlm_lock_decref_and_cancel(&och
->och_lease_handle
,
835 och
->och_lease_handle
.cookie
= 0ULL;
837 rc2
= ll_close_inode_openhandle(inode
, och
, 0, NULL
);
839 CERROR("%s: error closing file " DFID
": %d\n",
840 ll_get_fsname(inode
->i_sb
, NULL
, 0),
841 PFID(&ll_i2info(inode
)->lli_fid
), rc2
);
842 och
= NULL
; /* och has been freed in ll_close_inode_openhandle() */
844 ll_intent_release(&it
);
851 * Check whether a layout swap can be done between two inodes.
853 * \param[in] inode1 First inode to check
854 * \param[in] inode2 Second inode to check
856 * \retval 0 on success, layout swap can be performed between both inodes
857 * \retval negative error code if requirements are not met
859 static int ll_check_swap_layouts_validity(struct inode
*inode1
,
860 struct inode
*inode2
)
862 if (!S_ISREG(inode1
->i_mode
) || !S_ISREG(inode2
->i_mode
))
865 if (inode_permission(inode1
, MAY_WRITE
) ||
866 inode_permission(inode2
, MAY_WRITE
))
869 if (inode1
->i_sb
!= inode2
->i_sb
)
875 static int ll_swap_layouts_close(struct obd_client_handle
*och
,
876 struct inode
*inode
, struct inode
*inode2
)
878 const struct lu_fid
*fid1
= ll_inode2fid(inode
);
879 const struct lu_fid
*fid2
;
882 CDEBUG(D_INODE
, "%s: biased close of file " DFID
"\n",
883 ll_get_fsname(inode
->i_sb
, NULL
, 0), PFID(fid1
));
885 rc
= ll_check_swap_layouts_validity(inode
, inode2
);
889 /* We now know that inode2 is a lustre inode */
890 fid2
= ll_inode2fid(inode2
);
892 rc
= lu_fid_cmp(fid1
, fid2
);
899 * Close the file and swap layouts between inode & inode2.
900 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
901 * because we still need it to pack l_remote_handle to MDT.
903 rc
= ll_close_inode_openhandle(inode
, och
, MDS_CLOSE_LAYOUT_SWAP
,
906 och
= NULL
; /* freed in ll_close_inode_openhandle() */
914 * Release lease and close the file.
915 * It will check if the lease has ever broken.
917 static int ll_lease_close(struct obd_client_handle
*och
, struct inode
*inode
,
920 struct ldlm_lock
*lock
;
921 bool cancelled
= true;
923 lock
= ldlm_handle2lock(&och
->och_lease_handle
);
925 lock_res_and_lock(lock
);
926 cancelled
= ldlm_is_cancel(lock
);
927 unlock_res_and_lock(lock
);
931 CDEBUG(D_INODE
, "lease for " DFID
" broken? %d\n",
932 PFID(&ll_i2info(inode
)->lli_fid
), cancelled
);
935 ldlm_cli_cancel(&och
->och_lease_handle
, 0);
937 *lease_broken
= cancelled
;
939 return ll_close_inode_openhandle(inode
, och
, 0, NULL
);
942 int ll_merge_attr(const struct lu_env
*env
, struct inode
*inode
)
944 struct ll_inode_info
*lli
= ll_i2info(inode
);
945 struct cl_object
*obj
= lli
->lli_clob
;
946 struct cl_attr
*attr
= vvp_env_thread_attr(env
);
952 ll_inode_size_lock(inode
);
954 /* merge timestamps the most recently obtained from mds with
955 * timestamps obtained from osts
957 LTIME_S(inode
->i_atime
) = lli
->lli_atime
;
958 LTIME_S(inode
->i_mtime
) = lli
->lli_mtime
;
959 LTIME_S(inode
->i_ctime
) = lli
->lli_ctime
;
961 mtime
= LTIME_S(inode
->i_mtime
);
962 atime
= LTIME_S(inode
->i_atime
);
963 ctime
= LTIME_S(inode
->i_ctime
);
965 cl_object_attr_lock(obj
);
966 rc
= cl_object_attr_get(env
, obj
, attr
);
967 cl_object_attr_unlock(obj
);
970 goto out_size_unlock
;
972 if (atime
< attr
->cat_atime
)
973 atime
= attr
->cat_atime
;
975 if (ctime
< attr
->cat_ctime
)
976 ctime
= attr
->cat_ctime
;
978 if (mtime
< attr
->cat_mtime
)
979 mtime
= attr
->cat_mtime
;
981 CDEBUG(D_VFSTRACE
, DFID
" updating i_size %llu\n",
982 PFID(&lli
->lli_fid
), attr
->cat_size
);
984 i_size_write(inode
, attr
->cat_size
);
986 inode
->i_blocks
= attr
->cat_blocks
;
988 LTIME_S(inode
->i_mtime
) = mtime
;
989 LTIME_S(inode
->i_atime
) = atime
;
990 LTIME_S(inode
->i_ctime
) = ctime
;
993 ll_inode_size_unlock(inode
);
/*
 * Decide whether access-time updates are suppressed for @file.
 *
 * The tests run in this order: O_NOATIME on the open file, S_NOATIME on
 * the inode, IS_NOATIME(inode), MNT_NOATIME or MNT_READONLY on the mount,
 * and finally the no-directory-atime flag on the mount or on the
 * superblock when the inode is a directory.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably every matching test returns true and falling through returns
 * false - confirm against the full source.
 */
998 static bool file_is_noatime(const struct file
*file
)
1000 const struct vfsmount
*mnt
= file
->f_path
.mnt
;
1001 const struct inode
*inode
= file_inode(file
);
1003 /* Adapted from file_accessed() and touch_atime().*/
1004 if (file
->f_flags
& O_NOATIME
)
1007 if (inode
->i_flags
& S_NOATIME
)
1010 if (IS_NOATIME(inode
))
/* A read-only mount is treated the same as a noatime mount. */
1013 if (mnt
->mnt_flags
& (MNT_NOATIME
| MNT_READONLY
))
/* The two nodiratime tests only apply to directories. */
1016 if ((mnt
->mnt_flags
& MNT_NODIRATIME
) && S_ISDIR(inode
->i_mode
))
1019 if ((inode
->i_sb
->s_flags
& SB_NODIRATIME
) && S_ISDIR(inode
->i_mode
))
/*
 * Seed a cl_io from the open-file flags of @file.
 *
 * @io:    io descriptor to initialise
 * @file:  open file the io is issued against
 * @write: non-zero when the io is a write (the caller passes
 *         iot == CIT_WRITE)
 *
 * Sets the non-blocking flag, the append/sync write flags, the target
 * cl_object, the lock requirement and the noatime flag.
 *
 * NOTE(review): several lines are missing from this excerpt (the braces,
 * the final operand of the wr_sync expression and, presumably, an
 * `if (write)` guard around the two wr_* assignments) - confirm against the
 * full source.
 */
1025 static void ll_io_init(struct cl_io
*io
, const struct file
*file
, int write
)
1027 struct inode
*inode
= file_inode(file
);
1029 io
->u
.ci_rw
.crw_nonblock
= file
->f_flags
& O_NONBLOCK
;
/* !! collapses the flag test to 0 or 1. */
1031 io
->u
.ci_wr
.wr_append
= !!(file
->f_flags
& O_APPEND
);
/* O_SYNC or O_DIRECT each make the write synchronous. */
1032 io
->u
.ci_wr
.wr_sync
= file
->f_flags
& O_SYNC
||
1033 file
->f_flags
& O_DIRECT
||
1036 io
->ci_obj
= ll_i2info(inode
)->lli_clob
;
/*
 * Lock requirement: CILR_MAYBE by default, CILR_NEVER (and no server-side
 * lock) when ll_file_nolock() says so, CILR_MANDATORY for O_APPEND.
 */
1037 io
->ci_lockreq
= CILR_MAYBE
;
1038 if (ll_file_nolock(file
)) {
1039 io
->ci_lockreq
= CILR_NEVER
;
1040 io
->ci_no_srvlock
= 1;
1041 } else if (file
->f_flags
& O_APPEND
) {
1042 io
->ci_lockreq
= CILR_MANDATORY
;
1045 io
->ci_noatime
= file_is_noatime(file
);
1049 ll_file_io_generic(const struct lu_env
*env
, struct vvp_io_args
*args
,
1050 struct file
*file
, enum cl_io_type iot
,
1051 loff_t
*ppos
, size_t count
)
1053 struct ll_inode_info
*lli
= ll_i2info(file_inode(file
));
1054 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
1055 struct vvp_io
*vio
= vvp_env_io(env
);
1056 struct range_lock range
;
1061 CDEBUG(D_VFSTRACE
, "file: %pD, type: %d ppos: %llu, count: %zu\n",
1062 file
, iot
, *ppos
, count
);
1065 io
= vvp_env_thread_io(env
);
1066 ll_io_init(io
, file
, iot
== CIT_WRITE
);
1068 if (cl_io_rw_init(env
, io
, iot
, *ppos
, count
) == 0) {
1069 struct vvp_io
*vio
= vvp_env_io(env
);
1070 bool range_locked
= false;
1072 if (file
->f_flags
& O_APPEND
)
1073 range_lock_init(&range
, 0, LUSTRE_EOF
);
1075 range_lock_init(&range
, *ppos
, *ppos
+ count
- 1);
1077 vio
->vui_fd
= LUSTRE_FPRIVATE(file
);
1078 vio
->vui_iter
= args
->u
.normal
.via_iter
;
1079 vio
->vui_iocb
= args
->u
.normal
.via_iocb
;
1081 * Direct IO reads must also take range lock,
1082 * or multiple reads will try to work on the same pages
1083 * See LU-6227 for details.
1085 if (((iot
== CIT_WRITE
) ||
1086 (iot
== CIT_READ
&& (file
->f_flags
& O_DIRECT
))) &&
1087 !(vio
->vui_fd
->fd_flags
& LL_FILE_GROUP_LOCKED
)) {
1088 CDEBUG(D_VFSTRACE
, "Range lock [%llu, %llu]\n",
1089 range
.rl_node
.in_extent
.start
,
1090 range
.rl_node
.in_extent
.end
);
1091 rc
= range_lock(&lli
->lli_write_tree
, &range
);
1095 range_locked
= true;
1097 ll_cl_add(file
, env
, io
);
1098 rc
= cl_io_loop(env
, io
);
1099 ll_cl_remove(file
, env
);
1101 CDEBUG(D_VFSTRACE
, "Range unlock [%llu, %llu]\n",
1102 range
.rl_node
.in_extent
.start
,
1103 range
.rl_node
.in_extent
.end
);
1104 range_unlock(&lli
->lli_write_tree
, &range
);
1107 /* cl_io_rw_init() handled IO */
1111 if (io
->ci_nob
> 0) {
1112 result
= io
->ci_nob
;
1113 count
-= io
->ci_nob
;
1114 *ppos
= io
->u
.ci_wr
.wr
.crw_pos
;
1116 /* prepare IO restart */
1118 args
->u
.normal
.via_iter
= vio
->vui_iter
;
1121 cl_io_fini(env
, io
);
1123 if ((!rc
|| rc
== -ENODATA
) && count
> 0 && io
->ci_need_restart
) {
1125 "%s: restart %s from %lld, count:%zu, result: %zd\n",
1126 file_dentry(file
)->d_name
.name
,
1127 iot
== CIT_READ
? "read" : "write",
1128 *ppos
, count
, result
);
1132 if (iot
== CIT_READ
) {
1134 ll_stats_ops_tally(ll_i2sbi(file_inode(file
)),
1135 LPROC_LL_READ_BYTES
, result
);
1136 } else if (iot
== CIT_WRITE
) {
1138 ll_stats_ops_tally(ll_i2sbi(file_inode(file
)),
1139 LPROC_LL_WRITE_BYTES
, result
);
1140 fd
->fd_write_failed
= false;
1141 } else if (!result
&& !rc
) {
1144 fd
->fd_write_failed
= true;
1146 fd
->fd_write_failed
= false;
1147 } else if (rc
!= -ERESTARTSYS
) {
1148 fd
->fd_write_failed
= true;
1151 CDEBUG(D_VFSTRACE
, "iot: %d, result: %zd\n", iot
, result
);
1153 return result
> 0 ? result
: rc
;
/*
 * Read entry point taking a kiocb and an iov_iter.
 *
 * Acquires a cl environment, stores @to and @iocb in the environment's
 * vvp_io_args, and runs ll_file_io_generic() with CIT_READ on
 * iocb->ki_filp at iocb->ki_pos for iov_iter_count(to) bytes. The
 * environment is released before returning.
 *
 * NOTE(review): the local declarations, the check guarding the PTR_ERR()
 * return and the final return are not visible in this excerpt; presumably
 * the result of ll_file_io_generic() is returned - confirm.
 */
1156 static ssize_t
ll_file_read_iter(struct kiocb
*iocb
, struct iov_iter
*to
)
1159 struct vvp_io_args
*args
;
1163 env
= cl_env_get(&refcheck
);
1165 return PTR_ERR(env
);
1167 args
= ll_env_args(env
);
1168 args
->u
.normal
.via_iter
= to
;
1169 args
->u
.normal
.via_iocb
= iocb
;
/* ki_pos is passed by address so the generic path can advance it. */
1171 result
= ll_file_io_generic(env
, args
, iocb
->ki_filp
, CIT_READ
,
1172 &iocb
->ki_pos
, iov_iter_count(to
));
1173 cl_env_put(env
, &refcheck
);
1178 * Write to a file (through the page cache).
/*
 * Write entry point taking a kiocb and an iov_iter.
 *
 * Acquires a cl environment, stores @from and @iocb in the environment's
 * vvp_io_args, and runs ll_file_io_generic() with CIT_WRITE on
 * iocb->ki_filp at iocb->ki_pos for iov_iter_count(from) bytes. The
 * environment is released before returning.
 *
 * NOTE(review): the local declarations, the check guarding the PTR_ERR()
 * return and the final return are not visible in this excerpt; presumably
 * the result of ll_file_io_generic() is returned - confirm.
 */
1180 static ssize_t
ll_file_write_iter(struct kiocb
*iocb
, struct iov_iter
*from
)
1183 struct vvp_io_args
*args
;
1187 env
= cl_env_get(&refcheck
);
1189 return PTR_ERR(env
);
1191 args
= ll_env_args(env
);
1192 args
->u
.normal
.via_iter
= from
;
1193 args
->u
.normal
.via_iocb
= iocb
;
/* ki_pos is passed by address so the generic path can advance it. */
1195 result
= ll_file_io_generic(env
, args
, iocb
->ki_filp
, CIT_WRITE
,
1196 &iocb
->ki_pos
, iov_iter_count(from
));
1197 cl_env_put(env
, &refcheck
);
1201 int ll_lov_setstripe_ea_info(struct inode
*inode
, struct dentry
*dentry
,
1202 __u64 flags
, struct lov_user_md
*lum
,
1205 struct lookup_intent oit
= {
1207 .it_flags
= flags
| MDS_OPEN_BY_FID
,
1211 ll_inode_size_lock(inode
);
1212 rc
= ll_intent_file_open(dentry
, lum
, lum_size
, &oit
);
1216 ll_release_openhandle(inode
, &oit
);
1219 ll_inode_size_unlock(inode
);
1220 ll_intent_release(&oit
);
1224 int ll_lov_getstripe_ea_info(struct inode
*inode
, const char *filename
,
1225 struct lov_mds_md
**lmmp
, int *lmm_size
,
1226 struct ptlrpc_request
**request
)
1228 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
1229 struct mdt_body
*body
;
1230 struct lov_mds_md
*lmm
= NULL
;
1231 struct ptlrpc_request
*req
= NULL
;
1232 struct md_op_data
*op_data
;
1235 rc
= ll_get_default_mdsize(sbi
, &lmmsize
);
1239 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, filename
,
1240 strlen(filename
), lmmsize
,
1241 LUSTRE_OPC_ANY
, NULL
);
1242 if (IS_ERR(op_data
))
1243 return PTR_ERR(op_data
);
1245 op_data
->op_valid
= OBD_MD_FLEASIZE
| OBD_MD_FLDIREA
;
1246 rc
= md_getattr_name(sbi
->ll_md_exp
, op_data
, &req
);
1247 ll_finish_md_op_data(op_data
);
1249 CDEBUG(D_INFO
, "md_getattr_name failed on %s: rc %d\n",
1254 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
1256 lmmsize
= body
->mbo_eadatasize
;
1258 if (!(body
->mbo_valid
& (OBD_MD_FLEASIZE
| OBD_MD_FLDIREA
)) ||
1264 lmm
= req_capsule_server_sized_get(&req
->rq_pill
, &RMF_MDT_MD
, lmmsize
);
1266 if ((lmm
->lmm_magic
!= cpu_to_le32(LOV_MAGIC_V1
)) &&
1267 (lmm
->lmm_magic
!= cpu_to_le32(LOV_MAGIC_V3
))) {
1273 * This is coming from the MDS, so is probably in
1274 * little endian. We convert it to host endian before
1275 * passing it to userspace.
1277 if (cpu_to_le32(LOV_MAGIC
) != LOV_MAGIC
) {
1280 stripe_count
= le16_to_cpu(lmm
->lmm_stripe_count
);
1281 if (le32_to_cpu(lmm
->lmm_pattern
) & LOV_PATTERN_F_RELEASED
)
1284 /* if function called for directory - we should
1285 * avoid swab not existent lsm objects
1287 if (lmm
->lmm_magic
== cpu_to_le32(LOV_MAGIC_V1
)) {
1288 lustre_swab_lov_user_md_v1((struct lov_user_md_v1
*)lmm
);
1289 if (S_ISREG(body
->mbo_mode
))
1290 lustre_swab_lov_user_md_objects(
1291 ((struct lov_user_md_v1
*)lmm
)->lmm_objects
,
1293 } else if (lmm
->lmm_magic
== cpu_to_le32(LOV_MAGIC_V3
)) {
1294 lustre_swab_lov_user_md_v3((struct lov_user_md_v3
*)lmm
);
1295 if (S_ISREG(body
->mbo_mode
))
1296 lustre_swab_lov_user_md_objects(
1297 ((struct lov_user_md_v3
*)lmm
)->lmm_objects
,
1304 *lmm_size
= lmmsize
;
1309 static int ll_lov_setea(struct inode
*inode
, struct file
*file
,
1312 __u64 flags
= MDS_OPEN_HAS_OBJS
| FMODE_WRITE
;
1313 struct lov_user_md
*lump
;
1314 int lum_size
= sizeof(struct lov_user_md
) +
1315 sizeof(struct lov_user_ost_data
);
1318 if (!capable(CFS_CAP_SYS_ADMIN
))
1321 lump
= libcfs_kvzalloc(lum_size
, GFP_NOFS
);
1325 if (copy_from_user(lump
, (struct lov_user_md __user
*)arg
, lum_size
)) {
1330 rc
= ll_lov_setstripe_ea_info(inode
, file
->f_path
.dentry
, flags
, lump
,
1332 cl_lov_delay_create_clear(&file
->f_flags
);
1338 static int ll_file_getstripe(struct inode
*inode
,
1339 struct lov_user_md __user
*lum
)
1345 env
= cl_env_get(&refcheck
);
1347 return PTR_ERR(env
);
1349 rc
= cl_object_getstripe(env
, ll_i2info(inode
)->lli_clob
, lum
);
1350 cl_env_put(env
, &refcheck
);
1354 static int ll_lov_setstripe(struct inode
*inode
, struct file
*file
,
1357 struct lov_user_md __user
*lum
= (struct lov_user_md __user
*)arg
;
1358 struct lov_user_md
*klum
;
1360 __u64 flags
= FMODE_WRITE
;
1362 rc
= ll_copy_user_md(lum
, &klum
);
1367 rc
= ll_lov_setstripe_ea_info(inode
, file
->f_path
.dentry
, flags
, klum
,
1369 cl_lov_delay_create_clear(&file
->f_flags
);
1373 put_user(0, &lum
->lmm_stripe_count
);
1375 ll_layout_refresh(inode
, &gen
);
1376 rc
= ll_file_getstripe(inode
, (struct lov_user_md __user
*)arg
);
1384 ll_get_grouplock(struct inode
*inode
, struct file
*file
, unsigned long arg
)
1386 struct ll_inode_info
*lli
= ll_i2info(inode
);
1387 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
1388 struct ll_grouplock grouplock
;
1392 CWARN("group id for group lock must not be 0\n");
1396 if (ll_file_nolock(file
))
1399 spin_lock(&lli
->lli_lock
);
1400 if (fd
->fd_flags
& LL_FILE_GROUP_LOCKED
) {
1401 CWARN("group lock already existed with gid %lu\n",
1402 fd
->fd_grouplock
.lg_gid
);
1403 spin_unlock(&lli
->lli_lock
);
1406 LASSERT(!fd
->fd_grouplock
.lg_lock
);
1407 spin_unlock(&lli
->lli_lock
);
1409 rc
= cl_get_grouplock(ll_i2info(inode
)->lli_clob
,
1410 arg
, (file
->f_flags
& O_NONBLOCK
), &grouplock
);
1414 spin_lock(&lli
->lli_lock
);
1415 if (fd
->fd_flags
& LL_FILE_GROUP_LOCKED
) {
1416 spin_unlock(&lli
->lli_lock
);
1417 CERROR("another thread just won the race\n");
1418 cl_put_grouplock(&grouplock
);
1422 fd
->fd_flags
|= LL_FILE_GROUP_LOCKED
;
1423 fd
->fd_grouplock
= grouplock
;
1424 spin_unlock(&lli
->lli_lock
);
1426 CDEBUG(D_INFO
, "group lock %lu obtained\n", arg
);
1430 static int ll_put_grouplock(struct inode
*inode
, struct file
*file
,
1433 struct ll_inode_info
*lli
= ll_i2info(inode
);
1434 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
1435 struct ll_grouplock grouplock
;
1437 spin_lock(&lli
->lli_lock
);
1438 if (!(fd
->fd_flags
& LL_FILE_GROUP_LOCKED
)) {
1439 spin_unlock(&lli
->lli_lock
);
1440 CWARN("no group lock held\n");
1443 LASSERT(fd
->fd_grouplock
.lg_lock
);
1445 if (fd
->fd_grouplock
.lg_gid
!= arg
) {
1446 CWARN("group lock %lu doesn't match current id %lu\n",
1447 arg
, fd
->fd_grouplock
.lg_gid
);
1448 spin_unlock(&lli
->lli_lock
);
1452 grouplock
= fd
->fd_grouplock
;
1453 memset(&fd
->fd_grouplock
, 0, sizeof(fd
->fd_grouplock
));
1454 fd
->fd_flags
&= ~LL_FILE_GROUP_LOCKED
;
1455 spin_unlock(&lli
->lli_lock
);
1457 cl_put_grouplock(&grouplock
);
1458 CDEBUG(D_INFO
, "group lock %lu released\n", arg
);
1463 * Close inode open handle
1465 * \param inode [in] inode in question
1466 * \param it [in,out] intent which contains open info and result
1469 * \retval <0 failure
1471 int ll_release_openhandle(struct inode
*inode
, struct lookup_intent
*it
)
1473 struct obd_client_handle
*och
;
1478 /* Root ? Do nothing. */
1479 if (is_root_inode(inode
))
1482 /* No open handle to close? Move away */
1483 if (!it_disposition(it
, DISP_OPEN_OPEN
))
1486 LASSERT(it_open_error(DISP_OPEN_OPEN
, it
) == 0);
1488 och
= kzalloc(sizeof(*och
), GFP_NOFS
);
1494 ll_och_fill(ll_i2sbi(inode
)->ll_md_exp
, it
, och
);
1496 rc
= ll_close_inode_openhandle(inode
, och
, 0, NULL
);
1498 /* this one is in place of ll_file_open */
1499 if (it_disposition(it
, DISP_ENQ_OPEN_REF
)) {
1500 ptlrpc_req_finished(it
->it_request
);
1501 it_clear_disposition(it
, DISP_ENQ_OPEN_REF
);
1507 * Get size for inode for which FIEMAP mapping is requested.
1508 * Make the FIEMAP get_info call and returns the result.
1510 * \param fiemap kernel buffer to hold extens
1511 * \param num_bytes kernel buffer size
1513 static int ll_do_fiemap(struct inode
*inode
, struct fiemap
*fiemap
,
1516 struct ll_fiemap_info_key fmkey
= { .lfik_name
= KEY_FIEMAP
, };
1521 /* Checks for fiemap flags */
1522 if (fiemap
->fm_flags
& ~LUSTRE_FIEMAP_FLAGS_COMPAT
) {
1523 fiemap
->fm_flags
&= ~LUSTRE_FIEMAP_FLAGS_COMPAT
;
1527 /* Check for FIEMAP_FLAG_SYNC */
1528 if (fiemap
->fm_flags
& FIEMAP_FLAG_SYNC
) {
1529 rc
= filemap_fdatawrite(inode
->i_mapping
);
1534 env
= cl_env_get(&refcheck
);
1536 return PTR_ERR(env
);
1538 if (i_size_read(inode
) == 0) {
1539 rc
= ll_glimpse_size(inode
);
1544 fmkey
.lfik_oa
.o_valid
= OBD_MD_FLID
| OBD_MD_FLGROUP
;
1545 obdo_from_inode(&fmkey
.lfik_oa
, inode
, OBD_MD_FLSIZE
);
1546 obdo_set_parent_fid(&fmkey
.lfik_oa
, &ll_i2info(inode
)->lli_fid
);
1548 /* If filesize is 0, then there would be no objects for mapping */
1549 if (fmkey
.lfik_oa
.o_size
== 0) {
1550 fiemap
->fm_mapped_extents
= 0;
1555 memcpy(&fmkey
.lfik_fiemap
, fiemap
, sizeof(*fiemap
));
1557 rc
= cl_object_fiemap(env
, ll_i2info(inode
)->lli_clob
,
1558 &fmkey
, fiemap
, &num_bytes
);
1560 cl_env_put(env
, &refcheck
);
1564 int ll_fid2path(struct inode
*inode
, void __user
*arg
)
1566 struct obd_export
*exp
= ll_i2mdexp(inode
);
1567 const struct getinfo_fid2path __user
*gfin
= arg
;
1568 struct getinfo_fid2path
*gfout
;
1573 if (!capable(CFS_CAP_DAC_READ_SEARCH
) &&
1574 !(ll_i2sbi(inode
)->ll_flags
& LL_SBI_USER_FID2PATH
))
1577 /* Only need to get the buflen */
1578 if (get_user(pathlen
, &gfin
->gf_pathlen
))
1581 if (pathlen
> PATH_MAX
)
1584 outsize
= sizeof(*gfout
) + pathlen
;
1586 gfout
= kzalloc(outsize
, GFP_NOFS
);
1590 if (copy_from_user(gfout
, arg
, sizeof(*gfout
))) {
1595 /* Call mdc_iocontrol */
1596 rc
= obd_iocontrol(OBD_IOC_FID2PATH
, exp
, outsize
, gfout
, NULL
);
1600 if (copy_to_user(arg
, gfout
, outsize
))
1609 * Read the data_version for inode.
1611 * This value is computed using stripe object version on OST.
1612 * Version is computed using server side locking.
1614 * @param flags if do sync on the OST side;
1616 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1617 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1619 int ll_data_version(struct inode
*inode
, __u64
*data_version
, int flags
)
1621 struct cl_object
*obj
= ll_i2info(inode
)->lli_clob
;
1627 /* If no file object initialized, we consider its version is 0. */
1633 env
= cl_env_get(&refcheck
);
1635 return PTR_ERR(env
);
1637 io
= vvp_env_thread_io(env
);
1639 io
->u
.ci_data_version
.dv_data_version
= 0;
1640 io
->u
.ci_data_version
.dv_flags
= flags
;
1643 if (!cl_io_init(env
, io
, CIT_DATA_VERSION
, io
->ci_obj
))
1644 result
= cl_io_loop(env
, io
);
1646 result
= io
->ci_result
;
1648 *data_version
= io
->u
.ci_data_version
.dv_data_version
;
1650 cl_io_fini(env
, io
);
1652 if (unlikely(io
->ci_need_restart
))
1655 cl_env_put(env
, &refcheck
);
1661 * Trigger a HSM release request for the provided inode.
1663 int ll_hsm_release(struct inode
*inode
)
1666 struct obd_client_handle
*och
= NULL
;
1667 __u64 data_version
= 0;
1671 CDEBUG(D_INODE
, "%s: Releasing file " DFID
".\n",
1672 ll_get_fsname(inode
->i_sb
, NULL
, 0),
1673 PFID(&ll_i2info(inode
)->lli_fid
));
1675 och
= ll_lease_open(inode
, NULL
, FMODE_WRITE
, MDS_OPEN_RELEASE
);
1681 /* Grab latest data_version and [am]time values */
1682 rc
= ll_data_version(inode
, &data_version
, LL_DV_WR_FLUSH
);
1686 env
= cl_env_get(&refcheck
);
1692 ll_merge_attr(env
, inode
);
1693 cl_env_put(env
, &refcheck
);
1695 /* Release the file.
1696 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1697 * we still need it to pack l_remote_handle to MDT.
1699 rc
= ll_close_inode_openhandle(inode
, och
, MDS_HSM_RELEASE
,
1704 if (och
&& !IS_ERR(och
)) /* close the file */
1705 ll_lease_close(och
, inode
, NULL
);
1710 struct ll_swap_stack
{
1713 struct inode
*inode1
;
1714 struct inode
*inode2
;
1719 static int ll_swap_layouts(struct file
*file1
, struct file
*file2
,
1720 struct lustre_swap_layouts
*lsl
)
1722 struct mdc_swap_layouts msl
;
1723 struct md_op_data
*op_data
;
1726 struct ll_swap_stack
*llss
= NULL
;
1729 llss
= kzalloc(sizeof(*llss
), GFP_NOFS
);
1733 llss
->inode1
= file_inode(file1
);
1734 llss
->inode2
= file_inode(file2
);
1736 rc
= ll_check_swap_layouts_validity(llss
->inode1
, llss
->inode2
);
1740 /* we use 2 bool because it is easier to swap than 2 bits */
1741 if (lsl
->sl_flags
& SWAP_LAYOUTS_CHECK_DV1
)
1742 llss
->check_dv1
= true;
1744 if (lsl
->sl_flags
& SWAP_LAYOUTS_CHECK_DV2
)
1745 llss
->check_dv2
= true;
1747 /* we cannot use lsl->sl_dvX directly because we may swap them */
1748 llss
->dv1
= lsl
->sl_dv1
;
1749 llss
->dv2
= lsl
->sl_dv2
;
1751 rc
= lu_fid_cmp(ll_inode2fid(llss
->inode1
), ll_inode2fid(llss
->inode2
));
1752 if (!rc
) /* same file, done! */
1755 if (rc
< 0) { /* sequentialize it */
1756 swap(llss
->inode1
, llss
->inode2
);
1758 swap(llss
->dv1
, llss
->dv2
);
1759 swap(llss
->check_dv1
, llss
->check_dv2
);
1763 if (gid
!= 0) { /* application asks to flush dirty cache */
1764 rc
= ll_get_grouplock(llss
->inode1
, file1
, gid
);
1768 rc
= ll_get_grouplock(llss
->inode2
, file2
, gid
);
1770 ll_put_grouplock(llss
->inode1
, file1
, gid
);
1775 /* ultimate check, before swapping the layouts we check if
1776 * dataversion has changed (if requested)
1778 if (llss
->check_dv1
) {
1779 rc
= ll_data_version(llss
->inode1
, &dv
, 0);
1782 if (dv
!= llss
->dv1
) {
1788 if (llss
->check_dv2
) {
1789 rc
= ll_data_version(llss
->inode2
, &dv
, 0);
1792 if (dv
!= llss
->dv2
) {
1798 /* struct md_op_data is used to send the swap args to the mdt
1799 * only flags is missing, so we use struct mdc_swap_layouts
1800 * through the md_op_data->op_data
1802 /* flags from user space have to be converted before they are send to
1803 * server, no flag is sent today, they are only used on the client
1807 op_data
= ll_prep_md_op_data(NULL
, llss
->inode1
, llss
->inode2
, NULL
, 0,
1808 0, LUSTRE_OPC_ANY
, &msl
);
1809 if (IS_ERR(op_data
)) {
1810 rc
= PTR_ERR(op_data
);
1814 rc
= obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS
, ll_i2mdexp(llss
->inode1
),
1815 sizeof(*op_data
), op_data
, NULL
);
1816 ll_finish_md_op_data(op_data
);
1820 ll_put_grouplock(llss
->inode2
, file2
, gid
);
1821 ll_put_grouplock(llss
->inode1
, file1
, gid
);
1830 int ll_hsm_state_set(struct inode
*inode
, struct hsm_state_set
*hss
)
1832 struct md_op_data
*op_data
;
1835 /* Detect out-of range masks */
1836 if ((hss
->hss_setmask
| hss
->hss_clearmask
) & ~HSM_FLAGS_MASK
)
1839 /* Non-root users are forbidden to set or clear flags which are
1840 * NOT defined in HSM_USER_MASK.
1842 if (((hss
->hss_setmask
| hss
->hss_clearmask
) & ~HSM_USER_MASK
) &&
1843 !capable(CFS_CAP_SYS_ADMIN
))
1846 /* Detect out-of range archive id */
1847 if ((hss
->hss_valid
& HSS_ARCHIVE_ID
) &&
1848 (hss
->hss_archive_id
> LL_HSM_MAX_ARCHIVE
))
1851 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
1852 LUSTRE_OPC_ANY
, hss
);
1853 if (IS_ERR(op_data
))
1854 return PTR_ERR(op_data
);
1856 rc
= obd_iocontrol(LL_IOC_HSM_STATE_SET
, ll_i2mdexp(inode
),
1857 sizeof(*op_data
), op_data
, NULL
);
1859 ll_finish_md_op_data(op_data
);
1864 static int ll_hsm_import(struct inode
*inode
, struct file
*file
,
1865 struct hsm_user_import
*hui
)
1867 struct hsm_state_set
*hss
= NULL
;
1868 struct iattr
*attr
= NULL
;
1871 if (!S_ISREG(inode
->i_mode
))
1875 hss
= kzalloc(sizeof(*hss
), GFP_NOFS
);
1879 hss
->hss_valid
= HSS_SETMASK
| HSS_ARCHIVE_ID
;
1880 hss
->hss_archive_id
= hui
->hui_archive_id
;
1881 hss
->hss_setmask
= HS_ARCHIVED
| HS_EXISTS
| HS_RELEASED
;
1882 rc
= ll_hsm_state_set(inode
, hss
);
1886 attr
= kzalloc(sizeof(*attr
), GFP_NOFS
);
1892 attr
->ia_mode
= hui
->hui_mode
& 0777;
1893 attr
->ia_mode
|= S_IFREG
;
1894 attr
->ia_uid
= make_kuid(&init_user_ns
, hui
->hui_uid
);
1895 attr
->ia_gid
= make_kgid(&init_user_ns
, hui
->hui_gid
);
1896 attr
->ia_size
= hui
->hui_size
;
1897 attr
->ia_mtime
.tv_sec
= hui
->hui_mtime
;
1898 attr
->ia_mtime
.tv_nsec
= hui
->hui_mtime_ns
;
1899 attr
->ia_atime
.tv_sec
= hui
->hui_atime
;
1900 attr
->ia_atime
.tv_nsec
= hui
->hui_atime_ns
;
1902 attr
->ia_valid
= ATTR_SIZE
| ATTR_MODE
| ATTR_FORCE
|
1903 ATTR_UID
| ATTR_GID
|
1904 ATTR_MTIME
| ATTR_MTIME_SET
|
1905 ATTR_ATIME
| ATTR_ATIME_SET
;
1909 rc
= ll_setattr_raw(file
->f_path
.dentry
, attr
, true);
1913 inode_unlock(inode
);
1921 static inline long ll_lease_type_from_fmode(fmode_t fmode
)
1923 return ((fmode
& FMODE_READ
) ? LL_LEASE_RDLCK
: 0) |
1924 ((fmode
& FMODE_WRITE
) ? LL_LEASE_WRLCK
: 0);
1928 ll_file_ioctl(struct file
*file
, unsigned int cmd
, unsigned long arg
)
1930 struct inode
*inode
= file_inode(file
);
1931 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
1934 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p),cmd=%x\n",
1935 PFID(ll_inode2fid(inode
)), inode
, cmd
);
1936 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_IOCTL
, 1);
1938 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1939 if (_IOC_TYPE(cmd
) == 'T' || _IOC_TYPE(cmd
) == 't') /* tty ioctls */
1943 case LL_IOC_GETFLAGS
:
1944 /* Get the current value of the file flags */
1945 return put_user(fd
->fd_flags
, (int __user
*)arg
);
1946 case LL_IOC_SETFLAGS
:
1947 case LL_IOC_CLRFLAGS
:
1948 /* Set or clear specific file flags */
1949 /* XXX This probably needs checks to ensure the flags are
1950 * not abused, and to handle any flag side effects.
1952 if (get_user(flags
, (int __user
*)arg
))
1955 if (cmd
== LL_IOC_SETFLAGS
) {
1956 if ((flags
& LL_FILE_IGNORE_LOCK
) &&
1957 !(file
->f_flags
& O_DIRECT
)) {
1958 CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
1963 fd
->fd_flags
|= flags
;
1965 fd
->fd_flags
&= ~flags
;
1968 case LL_IOC_LOV_SETSTRIPE
:
1969 return ll_lov_setstripe(inode
, file
, arg
);
1970 case LL_IOC_LOV_SETEA
:
1971 return ll_lov_setea(inode
, file
, arg
);
1972 case LL_IOC_LOV_SWAP_LAYOUTS
: {
1974 struct lustre_swap_layouts lsl
;
1976 if (copy_from_user(&lsl
, (char __user
*)arg
,
1977 sizeof(struct lustre_swap_layouts
)))
1980 if ((file
->f_flags
& O_ACCMODE
) == O_RDONLY
)
1983 file2
= fget(lsl
.sl_fd
);
1987 /* O_WRONLY or O_RDWR */
1988 if ((file2
->f_flags
& O_ACCMODE
) == O_RDONLY
) {
1993 if (lsl
.sl_flags
& SWAP_LAYOUTS_CLOSE
) {
1994 struct obd_client_handle
*och
= NULL
;
1995 struct ll_inode_info
*lli
;
1996 struct inode
*inode2
;
1998 if (lsl
.sl_flags
!= SWAP_LAYOUTS_CLOSE
) {
2003 lli
= ll_i2info(inode
);
2004 mutex_lock(&lli
->lli_och_mutex
);
2005 if (fd
->fd_lease_och
) {
2006 och
= fd
->fd_lease_och
;
2007 fd
->fd_lease_och
= NULL
;
2009 mutex_unlock(&lli
->lli_och_mutex
);
2014 inode2
= file_inode(file2
);
2015 rc
= ll_swap_layouts_close(och
, inode
, inode2
);
2017 rc
= ll_swap_layouts(file
, file2
, &lsl
);
2023 case LL_IOC_LOV_GETSTRIPE
:
2024 return ll_file_getstripe(inode
,
2025 (struct lov_user_md __user
*)arg
);
2026 case FSFILT_IOC_GETFLAGS
:
2027 case FSFILT_IOC_SETFLAGS
:
2028 return ll_iocontrol(inode
, file
, cmd
, arg
);
2029 case FSFILT_IOC_GETVERSION_OLD
:
2030 case FSFILT_IOC_GETVERSION
:
2031 return put_user(inode
->i_generation
, (int __user
*)arg
);
2032 case LL_IOC_GROUP_LOCK
:
2033 return ll_get_grouplock(inode
, file
, arg
);
2034 case LL_IOC_GROUP_UNLOCK
:
2035 return ll_put_grouplock(inode
, file
, arg
);
2036 case IOC_OBD_STATFS
:
2037 return ll_obd_statfs(inode
, (void __user
*)arg
);
2039 /* We need to special case any other ioctls we want to handle,
2040 * to send them to the MDS/OST as appropriate and to properly
2041 * network encode the arg field.
2042 case FSFILT_IOC_SETVERSION_OLD:
2043 case FSFILT_IOC_SETVERSION:
2045 case LL_IOC_FLUSHCTX
:
2046 return ll_flush_ctx(inode
);
2047 case LL_IOC_PATH2FID
: {
2048 if (copy_to_user((void __user
*)arg
, ll_inode2fid(inode
),
2049 sizeof(struct lu_fid
)))
2054 case LL_IOC_GETPARENT
:
2055 return ll_getparent(file
, (struct getparent __user
*)arg
);
2056 case OBD_IOC_FID2PATH
:
2057 return ll_fid2path(inode
, (void __user
*)arg
);
2058 case LL_IOC_DATA_VERSION
: {
2059 struct ioc_data_version idv
;
2062 if (copy_from_user(&idv
, (char __user
*)arg
, sizeof(idv
)))
2065 idv
.idv_flags
&= LL_DV_RD_FLUSH
| LL_DV_WR_FLUSH
;
2066 rc
= ll_data_version(inode
, &idv
.idv_version
, idv
.idv_flags
);
2067 if (rc
== 0 && copy_to_user((char __user
*)arg
, &idv
,
2074 case LL_IOC_GET_MDTIDX
: {
2077 mdtidx
= ll_get_mdt_idx(inode
);
2081 if (put_user(mdtidx
, (int __user
*)arg
))
2086 case OBD_IOC_GETDTNAME
:
2087 case OBD_IOC_GETMDNAME
:
2088 return ll_get_obd_name(inode
, cmd
, arg
);
2089 case LL_IOC_HSM_STATE_GET
: {
2090 struct md_op_data
*op_data
;
2091 struct hsm_user_state
*hus
;
2094 hus
= kzalloc(sizeof(*hus
), GFP_NOFS
);
2098 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
2099 LUSTRE_OPC_ANY
, hus
);
2100 if (IS_ERR(op_data
)) {
2102 return PTR_ERR(op_data
);
2105 rc
= obd_iocontrol(cmd
, ll_i2mdexp(inode
), sizeof(*op_data
),
2108 if (copy_to_user((void __user
*)arg
, hus
, sizeof(*hus
)))
2111 ll_finish_md_op_data(op_data
);
2115 case LL_IOC_HSM_STATE_SET
: {
2116 struct hsm_state_set
*hss
;
2119 hss
= memdup_user((char __user
*)arg
, sizeof(*hss
));
2121 return PTR_ERR(hss
);
2123 rc
= ll_hsm_state_set(inode
, hss
);
2128 case LL_IOC_HSM_ACTION
: {
2129 struct md_op_data
*op_data
;
2130 struct hsm_current_action
*hca
;
2133 hca
= kzalloc(sizeof(*hca
), GFP_NOFS
);
2137 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
2138 LUSTRE_OPC_ANY
, hca
);
2139 if (IS_ERR(op_data
)) {
2141 return PTR_ERR(op_data
);
2144 rc
= obd_iocontrol(cmd
, ll_i2mdexp(inode
), sizeof(*op_data
),
2147 if (copy_to_user((char __user
*)arg
, hca
, sizeof(*hca
)))
2150 ll_finish_md_op_data(op_data
);
2154 case LL_IOC_SET_LEASE
: {
2155 struct ll_inode_info
*lli
= ll_i2info(inode
);
2156 struct obd_client_handle
*och
= NULL
;
2161 case LL_LEASE_WRLCK
:
2162 if (!(file
->f_mode
& FMODE_WRITE
))
2164 fmode
= FMODE_WRITE
;
2166 case LL_LEASE_RDLCK
:
2167 if (!(file
->f_mode
& FMODE_READ
))
2171 case LL_LEASE_UNLCK
:
2172 mutex_lock(&lli
->lli_och_mutex
);
2173 if (fd
->fd_lease_och
) {
2174 och
= fd
->fd_lease_och
;
2175 fd
->fd_lease_och
= NULL
;
2177 mutex_unlock(&lli
->lli_och_mutex
);
2182 fmode
= och
->och_flags
;
2183 rc
= ll_lease_close(och
, inode
, &lease_broken
);
2190 return ll_lease_type_from_fmode(fmode
);
2195 CDEBUG(D_INODE
, "Set lease with mode %u\n", fmode
);
2197 /* apply for lease */
2198 och
= ll_lease_open(inode
, file
, fmode
, 0);
2200 return PTR_ERR(och
);
2203 mutex_lock(&lli
->lli_och_mutex
);
2204 if (!fd
->fd_lease_och
) {
2205 fd
->fd_lease_och
= och
;
2208 mutex_unlock(&lli
->lli_och_mutex
);
2210 /* impossible now that only excl is supported for now */
2211 ll_lease_close(och
, inode
, &lease_broken
);
2216 case LL_IOC_GET_LEASE
: {
2217 struct ll_inode_info
*lli
= ll_i2info(inode
);
2218 struct ldlm_lock
*lock
= NULL
;
2221 mutex_lock(&lli
->lli_och_mutex
);
2222 if (fd
->fd_lease_och
) {
2223 struct obd_client_handle
*och
= fd
->fd_lease_och
;
2225 lock
= ldlm_handle2lock(&och
->och_lease_handle
);
2227 lock_res_and_lock(lock
);
2228 if (!ldlm_is_cancel(lock
))
2229 fmode
= och
->och_flags
;
2230 unlock_res_and_lock(lock
);
2231 LDLM_LOCK_PUT(lock
);
2234 mutex_unlock(&lli
->lli_och_mutex
);
2235 return ll_lease_type_from_fmode(fmode
);
2237 case LL_IOC_HSM_IMPORT
: {
2238 struct hsm_user_import
*hui
;
2240 hui
= memdup_user((void __user
*)arg
, sizeof(*hui
));
2242 return PTR_ERR(hui
);
2244 rc
= ll_hsm_import(inode
, file
, hui
);
2252 if (ll_iocontrol_call(inode
, file
, cmd
, arg
, &err
) ==
2256 return obd_iocontrol(cmd
, ll_i2dtexp(inode
), 0, NULL
,
2257 (void __user
*)arg
);
2262 static loff_t
ll_file_seek(struct file
*file
, loff_t offset
, int origin
)
2264 struct inode
*inode
= file_inode(file
);
2265 loff_t retval
, eof
= 0;
2267 retval
= offset
+ ((origin
== SEEK_END
) ? i_size_read(inode
) :
2268 (origin
== SEEK_CUR
) ? file
->f_pos
: 0);
2269 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p), to=%llu=%#llx(%d)\n",
2270 PFID(ll_inode2fid(inode
)), inode
, retval
, retval
, origin
);
2271 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_LLSEEK
, 1);
2273 if (origin
== SEEK_END
|| origin
== SEEK_HOLE
|| origin
== SEEK_DATA
) {
2274 retval
= ll_glimpse_size(inode
);
2277 eof
= i_size_read(inode
);
2280 return generic_file_llseek_size(file
, offset
, origin
,
2281 ll_file_maxbytes(inode
), eof
);
2284 static int ll_flush(struct file
*file
, fl_owner_t id
)
2286 struct inode
*inode
= file_inode(file
);
2287 struct ll_inode_info
*lli
= ll_i2info(inode
);
2288 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
2291 LASSERT(!S_ISDIR(inode
->i_mode
));
2293 /* catch async errors that were recorded back when async writeback
2294 * failed for pages in this mapping.
2296 rc
= lli
->lli_async_rc
;
2297 lli
->lli_async_rc
= 0;
2298 if (lli
->lli_clob
) {
2299 err
= lov_read_and_clear_async_rc(lli
->lli_clob
);
2304 /* The application has been told about write failure already.
2305 * Do not report failure again.
2307 if (fd
->fd_write_failed
)
2309 return rc
? -EIO
: 0;
2313 * Called to make sure a portion of file has been written out.
2314 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2316 * Return how many pages have been written.
2318 int cl_sync_file_range(struct inode
*inode
, loff_t start
, loff_t end
,
2319 enum cl_fsync_mode mode
, int ignore_layout
)
2323 struct cl_fsync_io
*fio
;
2327 if (mode
!= CL_FSYNC_NONE
&& mode
!= CL_FSYNC_LOCAL
&&
2328 mode
!= CL_FSYNC_DISCARD
&& mode
!= CL_FSYNC_ALL
)
2331 env
= cl_env_get(&refcheck
);
2333 return PTR_ERR(env
);
2335 io
= vvp_env_thread_io(env
);
2336 io
->ci_obj
= ll_i2info(inode
)->lli_clob
;
2337 io
->ci_ignore_layout
= ignore_layout
;
2339 /* initialize parameters for sync */
2340 fio
= &io
->u
.ci_fsync
;
2341 fio
->fi_start
= start
;
2343 fio
->fi_fid
= ll_inode2fid(inode
);
2344 fio
->fi_mode
= mode
;
2345 fio
->fi_nr_written
= 0;
2347 if (cl_io_init(env
, io
, CIT_FSYNC
, io
->ci_obj
) == 0)
2348 result
= cl_io_loop(env
, io
);
2350 result
= io
->ci_result
;
2352 result
= fio
->fi_nr_written
;
2353 cl_io_fini(env
, io
);
2354 cl_env_put(env
, &refcheck
);
2359 int ll_fsync(struct file
*file
, loff_t start
, loff_t end
, int datasync
)
2361 struct inode
*inode
= file_inode(file
);
2362 struct ll_inode_info
*lli
= ll_i2info(inode
);
2363 struct ptlrpc_request
*req
;
2366 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p)\n",
2367 PFID(ll_inode2fid(inode
)), inode
);
2368 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_FSYNC
, 1);
2370 rc
= file_write_and_wait_range(file
, start
, end
);
2373 /* catch async errors that were recorded back when async writeback
2374 * failed for pages in this mapping.
2376 if (!S_ISDIR(inode
->i_mode
)) {
2377 err
= lli
->lli_async_rc
;
2378 lli
->lli_async_rc
= 0;
2381 if (lli
->lli_clob
) {
2382 err
= lov_read_and_clear_async_rc(lli
->lli_clob
);
2388 err
= md_sync(ll_i2sbi(inode
)->ll_md_exp
, ll_inode2fid(inode
), &req
);
2392 ptlrpc_req_finished(req
);
2394 if (S_ISREG(inode
->i_mode
)) {
2395 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
2397 err
= cl_sync_file_range(inode
, start
, end
, CL_FSYNC_ALL
, 0);
2398 if (rc
== 0 && err
< 0)
2401 fd
->fd_write_failed
= true;
2403 fd
->fd_write_failed
= false;
2406 inode_unlock(inode
);
2411 ll_file_flock(struct file
*file
, int cmd
, struct file_lock
*file_lock
)
2413 struct inode
*inode
= file_inode(file
);
2414 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
2415 struct ldlm_enqueue_info einfo
= {
2416 .ei_type
= LDLM_FLOCK
,
2417 .ei_cb_cp
= ldlm_flock_completion_ast
,
2418 .ei_cbdata
= file_lock
,
2420 struct md_op_data
*op_data
;
2421 struct lustre_handle lockh
= {0};
2422 union ldlm_policy_data flock
= { { 0 } };
2423 int fl_type
= file_lock
->fl_type
;
2428 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
" file_lock=%p\n",
2429 PFID(ll_inode2fid(inode
)), file_lock
);
2431 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_FLOCK
, 1);
2433 if (file_lock
->fl_flags
& FL_FLOCK
)
2434 LASSERT((cmd
== F_SETLKW
) || (cmd
== F_SETLK
));
2435 else if (!(file_lock
->fl_flags
& FL_POSIX
))
2438 flock
.l_flock
.owner
= (unsigned long)file_lock
->fl_owner
;
2439 flock
.l_flock
.pid
= file_lock
->fl_pid
;
2440 flock
.l_flock
.start
= file_lock
->fl_start
;
2441 flock
.l_flock
.end
= file_lock
->fl_end
;
2443 /* Somewhat ugly workaround for svc lockd.
2444 * lockd installs custom fl_lmops->lm_compare_owner that checks
2445 * for the fl_owner to be the same (which it always is on local node
2446 * I guess between lockd processes) and then compares pid.
2447 * As such we assign pid to the owner field to make it all work,
2448 * conflict with normal locks is unlikely since pid space and
2449 * pointer space for current->files are not intersecting
2451 if (file_lock
->fl_lmops
&& file_lock
->fl_lmops
->lm_compare_owner
)
2452 flock
.l_flock
.owner
= (unsigned long)file_lock
->fl_pid
;
2456 einfo
.ei_mode
= LCK_PR
;
2459 /* An unlock request may or may not have any relation to
2460 * existing locks so we may not be able to pass a lock handle
2461 * via a normal ldlm_lock_cancel() request. The request may even
2462 * unlock a byte range in the middle of an existing lock. In
2463 * order to process an unlock request we need all of the same
2464 * information that is given with a normal read or write record
2465 * lock request. To avoid creating another ldlm unlock (cancel)
2466 * message we'll treat a LCK_NL flock request as an unlock.
2468 einfo
.ei_mode
= LCK_NL
;
2471 einfo
.ei_mode
= LCK_PW
;
2474 CDEBUG(D_INFO
, "Unknown fcntl lock type: %d\n", fl_type
);
2489 flags
= LDLM_FL_BLOCK_NOWAIT
;
2495 flags
= LDLM_FL_TEST_LOCK
;
2498 CERROR("unknown fcntl lock command: %d\n", cmd
);
2503 * Save the old mode so that if the mode in the lock changes we
2504 * can decrement the appropriate reader or writer refcount.
2506 file_lock
->fl_type
= einfo
.ei_mode
;
2508 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
2509 LUSTRE_OPC_ANY
, NULL
);
2510 if (IS_ERR(op_data
))
2511 return PTR_ERR(op_data
);
2513 CDEBUG(D_DLMTRACE
, "inode=" DFID
", pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2514 PFID(ll_inode2fid(inode
)), flock
.l_flock
.pid
, flags
,
2515 einfo
.ei_mode
, flock
.l_flock
.start
, flock
.l_flock
.end
);
2517 rc
= md_enqueue(sbi
->ll_md_exp
, &einfo
, &flock
, NULL
, op_data
, &lockh
,
2520 /* Restore the file lock type if not TEST lock. */
2521 if (!(flags
& LDLM_FL_TEST_LOCK
))
2522 file_lock
->fl_type
= fl_type
;
2524 if ((rc
== 0 || file_lock
->fl_type
== F_UNLCK
) &&
2525 !(flags
& LDLM_FL_TEST_LOCK
))
2526 rc2
= locks_lock_file_wait(file
, file_lock
);
2528 if (rc2
&& file_lock
->fl_type
!= F_UNLCK
) {
2529 einfo
.ei_mode
= LCK_NL
;
2530 md_enqueue(sbi
->ll_md_exp
, &einfo
, &flock
, NULL
, op_data
,
2535 ll_finish_md_op_data(op_data
);
2540 int ll_get_fid_by_name(struct inode
*parent
, const char *name
,
2541 int namelen
, struct lu_fid
*fid
,
2542 struct inode
**inode
)
2544 struct md_op_data
*op_data
= NULL
;
2545 struct ptlrpc_request
*req
;
2546 struct mdt_body
*body
;
2549 op_data
= ll_prep_md_op_data(NULL
, parent
, NULL
, name
, namelen
, 0,
2550 LUSTRE_OPC_ANY
, NULL
);
2551 if (IS_ERR(op_data
))
2552 return PTR_ERR(op_data
);
2554 op_data
->op_valid
= OBD_MD_FLID
| OBD_MD_FLTYPE
;
2555 rc
= md_getattr_name(ll_i2sbi(parent
)->ll_md_exp
, op_data
, &req
);
2556 ll_finish_md_op_data(op_data
);
2560 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
2566 *fid
= body
->mbo_fid1
;
2569 rc
= ll_prep_inode(inode
, req
, parent
->i_sb
, NULL
);
2571 ptlrpc_req_finished(req
);
2575 int ll_migrate(struct inode
*parent
, struct file
*file
, int mdtidx
,
2576 const char *name
, int namelen
)
2578 struct ptlrpc_request
*request
= NULL
;
2579 struct obd_client_handle
*och
= NULL
;
2580 struct inode
*child_inode
= NULL
;
2581 struct dentry
*dchild
= NULL
;
2582 struct md_op_data
*op_data
;
2583 struct mdt_body
*body
;
2584 u64 data_version
= 0;
2588 CDEBUG(D_VFSTRACE
, "migrate %s under " DFID
" to MDT%d\n",
2589 name
, PFID(ll_inode2fid(parent
)), mdtidx
);
2591 op_data
= ll_prep_md_op_data(NULL
, parent
, NULL
, name
, namelen
,
2592 0, LUSTRE_OPC_ANY
, NULL
);
2593 if (IS_ERR(op_data
))
2594 return PTR_ERR(op_data
);
2596 /* Get child FID first */
2597 qstr
.hash
= full_name_hash(parent
, name
, namelen
);
2600 dchild
= d_lookup(file_dentry(file
), &qstr
);
2602 op_data
->op_fid3
= *ll_inode2fid(dchild
->d_inode
);
2603 if (dchild
->d_inode
)
2604 child_inode
= igrab(dchild
->d_inode
);
2609 rc
= ll_get_fid_by_name(parent
, name
, namelen
,
2610 &op_data
->op_fid3
, &child_inode
);
2620 inode_lock(child_inode
);
2621 op_data
->op_fid3
= *ll_inode2fid(child_inode
);
2622 if (!fid_is_sane(&op_data
->op_fid3
)) {
2623 CERROR("%s: migrate %s, but fid " DFID
" is insane\n",
2624 ll_get_fsname(parent
->i_sb
, NULL
, 0), name
,
2625 PFID(&op_data
->op_fid3
));
2630 rc
= ll_get_mdt_idx_by_fid(ll_i2sbi(parent
), &op_data
->op_fid3
);
2635 CDEBUG(D_INFO
, "%s: " DFID
" is already on MDT%d.\n", name
,
2636 PFID(&op_data
->op_fid3
), mdtidx
);
2641 if (S_ISREG(child_inode
->i_mode
)) {
2642 och
= ll_lease_open(child_inode
, NULL
, FMODE_WRITE
, 0);
2649 rc
= ll_data_version(child_inode
, &data_version
,
2654 op_data
->op_handle
= och
->och_fh
;
2655 op_data
->op_data
= och
->och_mod
;
2656 op_data
->op_data_version
= data_version
;
2657 op_data
->op_lease_handle
= och
->och_lease_handle
;
2658 op_data
->op_bias
|= MDS_RENAME_MIGRATE
;
2661 op_data
->op_mds
= mdtidx
;
2662 op_data
->op_cli_flags
= CLI_MIGRATE
;
2663 rc
= md_rename(ll_i2sbi(parent
)->ll_md_exp
, op_data
, name
,
2664 namelen
, name
, namelen
, &request
);
2667 ll_update_times(request
, parent
);
2669 body
= req_capsule_server_get(&request
->rq_pill
, &RMF_MDT_BODY
);
2673 * If the server does release layout lock, then we cleanup
2674 * the client och here, otherwise release it in out_close:
2676 if (och
&& body
->mbo_valid
& OBD_MD_CLOSE_INTENT_EXECED
) {
2677 obd_mod_put(och
->och_mod
);
2678 md_clear_open_replay_data(ll_i2sbi(parent
)->ll_md_exp
,
2680 och
->och_fh
.cookie
= DEAD_HANDLE_MAGIC
;
2687 ptlrpc_req_finished(request
);
2691 /* Try again if the file layout has changed. */
2692 if (rc
== -EAGAIN
&& S_ISREG(child_inode
->i_mode
))
2696 if (och
) /* close the file */
2697 ll_lease_close(och
, child_inode
, NULL
);
2699 clear_nlink(child_inode
);
2701 inode_unlock(child_inode
);
2704 ll_finish_md_op_data(op_data
);
2709 ll_file_noflock(struct file
*file
, int cmd
, struct file_lock
*file_lock
)
2715 * test if some locks matching bits and l_req_mode are acquired
2716 * - bits can be in different locks
2717 * - if found clear the common lock bits in *bits
2718 * - the bits not found, are kept in *bits
2720 * \param bits [IN] searched lock bits [IN]
2721 * \param l_req_mode [IN] searched lock mode
2722 * \retval boolean, true iff all bits are found
2724 int ll_have_md_lock(struct inode
*inode
, __u64
*bits
,
2725 enum ldlm_mode l_req_mode
)
2727 struct lustre_handle lockh
;
2728 union ldlm_policy_data policy
;
2729 enum ldlm_mode mode
= (l_req_mode
== LCK_MINMODE
) ?
2730 (LCK_CR
| LCK_CW
| LCK_PR
| LCK_PW
) : l_req_mode
;
2738 fid
= &ll_i2info(inode
)->lli_fid
;
2739 CDEBUG(D_INFO
, "trying to match res " DFID
" mode %s\n", PFID(fid
),
2740 ldlm_lockname
[mode
]);
2742 flags
= LDLM_FL_BLOCK_GRANTED
| LDLM_FL_CBPENDING
| LDLM_FL_TEST_LOCK
;
2743 for (i
= 0; i
<= MDS_INODELOCK_MAXSHIFT
&& *bits
!= 0; i
++) {
2744 policy
.l_inodebits
.bits
= *bits
& (1 << i
);
2745 if (policy
.l_inodebits
.bits
== 0)
2748 if (md_lock_match(ll_i2mdexp(inode
), flags
, fid
, LDLM_IBITS
,
2749 &policy
, mode
, &lockh
)) {
2750 struct ldlm_lock
*lock
;
2752 lock
= ldlm_handle2lock(&lockh
);
2755 ~(lock
->l_policy_data
.l_inodebits
.bits
);
2756 LDLM_LOCK_PUT(lock
);
2758 *bits
&= ~policy
.l_inodebits
.bits
;
2765 enum ldlm_mode
ll_take_md_lock(struct inode
*inode
, __u64 bits
,
2766 struct lustre_handle
*lockh
, __u64 flags
,
2767 enum ldlm_mode mode
)
2769 union ldlm_policy_data policy
= { .l_inodebits
= { bits
} };
2772 fid
= &ll_i2info(inode
)->lli_fid
;
2773 CDEBUG(D_INFO
, "trying to match res " DFID
"\n", PFID(fid
));
2775 return md_lock_match(ll_i2mdexp(inode
), flags
| LDLM_FL_BLOCK_GRANTED
,
2776 fid
, LDLM_IBITS
, &policy
, mode
, lockh
);
2779 static int ll_inode_revalidate_fini(struct inode
*inode
, int rc
)
2781 /* Already unlinked. Just update nlink and return success */
2782 if (rc
== -ENOENT
) {
2784 /* If it is striped directory, and there is bad stripe
2785 * Let's revalidate the dentry again, instead of returning
2788 if (S_ISDIR(inode
->i_mode
) && ll_i2info(inode
)->lli_lsm_md
)
2791 /* This path cannot be hit for regular files unless in
2792 * case of obscure races, so no need to validate size.
2794 if (!S_ISREG(inode
->i_mode
) && !S_ISDIR(inode
->i_mode
))
2796 } else if (rc
!= 0) {
2797 CDEBUG_LIMIT((rc
== -EACCES
|| rc
== -EIDRM
) ? D_INFO
: D_ERROR
,
2798 "%s: revalidate FID " DFID
" error: rc = %d\n",
2799 ll_get_fsname(inode
->i_sb
, NULL
, 0),
2800 PFID(ll_inode2fid(inode
)), rc
);
2806 static int __ll_inode_revalidate(struct dentry
*dentry
, __u64 ibits
)
2808 struct inode
*inode
= d_inode(dentry
);
2809 struct ptlrpc_request
*req
= NULL
;
2810 struct obd_export
*exp
;
2813 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p),name=%pd\n",
2814 PFID(ll_inode2fid(inode
)), inode
, dentry
);
2816 exp
= ll_i2mdexp(inode
);
2818 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2819 * But under CMD case, it caused some lock issues, should be fixed
2820 * with new CMD ibits lock. See bug 12718
2822 if (exp_connect_flags(exp
) & OBD_CONNECT_ATTRFID
) {
2823 struct lookup_intent oit
= { .it_op
= IT_GETATTR
};
2824 struct md_op_data
*op_data
;
2826 if (ibits
== MDS_INODELOCK_LOOKUP
)
2827 oit
.it_op
= IT_LOOKUP
;
2829 /* Call getattr by fid, so do not provide name at all. */
2830 op_data
= ll_prep_md_op_data(NULL
, inode
,
2832 LUSTRE_OPC_ANY
, NULL
);
2833 if (IS_ERR(op_data
))
2834 return PTR_ERR(op_data
);
2836 rc
= md_intent_lock(exp
, op_data
, &oit
, &req
,
2837 &ll_md_blocking_ast
, 0);
2838 ll_finish_md_op_data(op_data
);
2840 rc
= ll_inode_revalidate_fini(inode
, rc
);
2844 rc
= ll_revalidate_it_finish(req
, &oit
, inode
);
2846 ll_intent_release(&oit
);
2850 /* Unlinked? Unhash dentry, so it is not picked up later by
2851 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2852 * here to preserve get_cwd functionality on 2.6.
2855 if (!d_inode(dentry
)->i_nlink
) {
2856 spin_lock(&inode
->i_lock
);
2857 d_lustre_invalidate(dentry
, 0);
2858 spin_unlock(&inode
->i_lock
);
2861 ll_lookup_finish_locks(&oit
, inode
);
2862 } else if (!ll_have_md_lock(d_inode(dentry
), &ibits
, LCK_MINMODE
)) {
2863 struct ll_sb_info
*sbi
= ll_i2sbi(d_inode(dentry
));
2864 u64 valid
= OBD_MD_FLGETATTR
;
2865 struct md_op_data
*op_data
;
2868 if (S_ISREG(inode
->i_mode
)) {
2869 rc
= ll_get_default_mdsize(sbi
, &ealen
);
2872 valid
|= OBD_MD_FLEASIZE
| OBD_MD_FLMODEASIZE
;
2875 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
,
2876 0, ealen
, LUSTRE_OPC_ANY
,
2878 if (IS_ERR(op_data
))
2879 return PTR_ERR(op_data
);
2881 op_data
->op_valid
= valid
;
2882 rc
= md_getattr(sbi
->ll_md_exp
, op_data
, &req
);
2883 ll_finish_md_op_data(op_data
);
2885 return ll_inode_revalidate_fini(inode
, rc
);
2887 rc
= ll_prep_inode(&inode
, req
, NULL
, NULL
);
2890 ptlrpc_req_finished(req
);
2894 static int ll_merge_md_attr(struct inode
*inode
)
2896 struct cl_attr attr
= { 0 };
2899 LASSERT(ll_i2info(inode
)->lli_lsm_md
);
2900 rc
= md_merge_attr(ll_i2mdexp(inode
), ll_i2info(inode
)->lli_lsm_md
,
2901 &attr
, ll_md_blocking_ast
);
2905 set_nlink(inode
, attr
.cat_nlink
);
2906 inode
->i_blocks
= attr
.cat_blocks
;
2907 i_size_write(inode
, attr
.cat_size
);
2909 ll_i2info(inode
)->lli_atime
= attr
.cat_atime
;
2910 ll_i2info(inode
)->lli_mtime
= attr
.cat_mtime
;
2911 ll_i2info(inode
)->lli_ctime
= attr
.cat_ctime
;
2916 static int ll_inode_revalidate(struct dentry
*dentry
, __u64 ibits
)
2918 struct inode
*inode
= d_inode(dentry
);
2921 rc
= __ll_inode_revalidate(dentry
, ibits
);
2925 /* if object isn't regular file, don't validate size */
2926 if (!S_ISREG(inode
->i_mode
)) {
2927 if (S_ISDIR(inode
->i_mode
) &&
2928 ll_i2info(inode
)->lli_lsm_md
) {
2929 rc
= ll_merge_md_attr(inode
);
2934 LTIME_S(inode
->i_atime
) = ll_i2info(inode
)->lli_atime
;
2935 LTIME_S(inode
->i_mtime
) = ll_i2info(inode
)->lli_mtime
;
2936 LTIME_S(inode
->i_ctime
) = ll_i2info(inode
)->lli_ctime
;
2938 struct ll_inode_info
*lli
= ll_i2info(inode
);
2940 /* In case of restore, the MDT has the right size and has
2941 * already send it back without granting the layout lock,
2942 * inode is up-to-date so glimpse is useless.
2943 * Also to glimpse we need the layout, in case of a running
2944 * restore the MDT holds the layout lock so the glimpse will
2945 * block up to the end of restore (getattr will block)
2947 if (!test_bit(LLIF_FILE_RESTORING
, &lli
->lli_flags
))
2948 rc
= ll_glimpse_size(inode
);
2953 int ll_getattr(const struct path
*path
, struct kstat
*stat
,
2954 u32 request_mask
, unsigned int flags
)
2956 struct inode
*inode
= d_inode(path
->dentry
);
2957 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
2958 struct ll_inode_info
*lli
= ll_i2info(inode
);
2961 res
= ll_inode_revalidate(path
->dentry
,
2962 MDS_INODELOCK_UPDATE
| MDS_INODELOCK_LOOKUP
);
2963 ll_stats_ops_tally(sbi
, LPROC_LL_GETATTR
, 1);
2968 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY
, 30);
2970 stat
->dev
= inode
->i_sb
->s_dev
;
2971 if (ll_need_32bit_api(sbi
))
2972 stat
->ino
= cl_fid_build_ino(&lli
->lli_fid
, 1);
2974 stat
->ino
= inode
->i_ino
;
2975 stat
->mode
= inode
->i_mode
;
2976 stat
->uid
= inode
->i_uid
;
2977 stat
->gid
= inode
->i_gid
;
2978 stat
->rdev
= inode
->i_rdev
;
2979 stat
->atime
= inode
->i_atime
;
2980 stat
->mtime
= inode
->i_mtime
;
2981 stat
->ctime
= inode
->i_ctime
;
2982 stat
->blksize
= 1 << inode
->i_blkbits
;
2984 stat
->nlink
= inode
->i_nlink
;
2985 stat
->size
= i_size_read(inode
);
2986 stat
->blocks
= inode
->i_blocks
;
2991 static int ll_fiemap(struct inode
*inode
, struct fiemap_extent_info
*fieinfo
,
2992 __u64 start
, __u64 len
)
2996 struct fiemap
*fiemap
;
2997 unsigned int extent_count
= fieinfo
->fi_extents_max
;
2999 num_bytes
= sizeof(*fiemap
) + (extent_count
*
3000 sizeof(struct fiemap_extent
));
3001 fiemap
= libcfs_kvzalloc(num_bytes
, GFP_NOFS
);
3005 fiemap
->fm_flags
= fieinfo
->fi_flags
;
3006 fiemap
->fm_extent_count
= fieinfo
->fi_extents_max
;
3007 fiemap
->fm_start
= start
;
3008 fiemap
->fm_length
= len
;
3010 if (extent_count
> 0 &&
3011 copy_from_user(&fiemap
->fm_extents
[0], fieinfo
->fi_extents_start
,
3012 sizeof(struct fiemap_extent
))) {
3017 rc
= ll_do_fiemap(inode
, fiemap
, num_bytes
);
3019 fieinfo
->fi_flags
= fiemap
->fm_flags
;
3020 fieinfo
->fi_extents_mapped
= fiemap
->fm_mapped_extents
;
3021 if (extent_count
> 0 &&
3022 copy_to_user(fieinfo
->fi_extents_start
, &fiemap
->fm_extents
[0],
3023 fiemap
->fm_mapped_extents
*
3024 sizeof(struct fiemap_extent
))) {
3033 struct posix_acl
*ll_get_acl(struct inode
*inode
, int type
)
3035 struct ll_inode_info
*lli
= ll_i2info(inode
);
3036 struct posix_acl
*acl
= NULL
;
3038 spin_lock(&lli
->lli_lock
);
3039 /* VFS' acl_permission_check->check_acl will release the refcount */
3040 acl
= posix_acl_dup(lli
->lli_posix_acl
);
3041 spin_unlock(&lli
->lli_lock
);
3046 int ll_inode_permission(struct inode
*inode
, int mask
)
3048 struct ll_sb_info
*sbi
;
3049 struct root_squash_info
*squash
;
3050 const struct cred
*old_cred
= NULL
;
3051 struct cred
*cred
= NULL
;
3052 bool squash_id
= false;
3056 if (mask
& MAY_NOT_BLOCK
)
3059 /* as root inode are NOT getting validated in lookup operation,
3060 * need to do it before permission check.
3063 if (is_root_inode(inode
)) {
3064 rc
= __ll_inode_revalidate(inode
->i_sb
->s_root
,
3065 MDS_INODELOCK_LOOKUP
);
3070 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p), inode mode %x mask %o\n",
3071 PFID(ll_inode2fid(inode
)), inode
, inode
->i_mode
, mask
);
3073 /* squash fsuid/fsgid if needed */
3074 sbi
= ll_i2sbi(inode
);
3075 squash
= &sbi
->ll_squash
;
3076 if (unlikely(squash
->rsi_uid
&&
3077 uid_eq(current_fsuid(), GLOBAL_ROOT_UID
) &&
3078 !(sbi
->ll_flags
& LL_SBI_NOROOTSQUASH
))) {
3083 CDEBUG(D_OTHER
, "squash creds (%d:%d)=>(%d:%d)\n",
3084 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3085 squash
->rsi_uid
, squash
->rsi_gid
);
3088 * update current process's credentials
3091 cred
= prepare_creds();
3095 cred
->fsuid
= make_kuid(&init_user_ns
, squash
->rsi_uid
);
3096 cred
->fsgid
= make_kgid(&init_user_ns
, squash
->rsi_gid
);
3097 for (cap
= 0; cap
< sizeof(cfs_cap_t
) * 8; cap
++) {
3098 if ((1 << cap
) & CFS_CAP_FS_MASK
)
3099 cap_lower(cred
->cap_effective
, cap
);
3101 old_cred
= override_creds(cred
);
3104 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_INODE_PERM
, 1);
3105 rc
= generic_permission(inode
, mask
);
3107 /* restore current process's credentials and FS capability */
3109 revert_creds(old_cred
);
3116 /* -o localflock - only provides locally consistent flock locks */
3117 const struct file_operations ll_file_operations
= {
3118 .read_iter
= ll_file_read_iter
,
3119 .write_iter
= ll_file_write_iter
,
3120 .unlocked_ioctl
= ll_file_ioctl
,
3121 .open
= ll_file_open
,
3122 .release
= ll_file_release
,
3123 .mmap
= ll_file_mmap
,
3124 .llseek
= ll_file_seek
,
3125 .splice_read
= generic_file_splice_read
,
3130 const struct file_operations ll_file_operations_flock
= {
3131 .read_iter
= ll_file_read_iter
,
3132 .write_iter
= ll_file_write_iter
,
3133 .unlocked_ioctl
= ll_file_ioctl
,
3134 .open
= ll_file_open
,
3135 .release
= ll_file_release
,
3136 .mmap
= ll_file_mmap
,
3137 .llseek
= ll_file_seek
,
3138 .splice_read
= generic_file_splice_read
,
3141 .flock
= ll_file_flock
,
3142 .lock
= ll_file_flock
3145 /* These are for -o noflock - to return ENOSYS on flock calls */
3146 const struct file_operations ll_file_operations_noflock
= {
3147 .read_iter
= ll_file_read_iter
,
3148 .write_iter
= ll_file_write_iter
,
3149 .unlocked_ioctl
= ll_file_ioctl
,
3150 .open
= ll_file_open
,
3151 .release
= ll_file_release
,
3152 .mmap
= ll_file_mmap
,
3153 .llseek
= ll_file_seek
,
3154 .splice_read
= generic_file_splice_read
,
3157 .flock
= ll_file_noflock
,
3158 .lock
= ll_file_noflock
3161 const struct inode_operations ll_file_inode_operations
= {
3162 .setattr
= ll_setattr
,
3163 .getattr
= ll_getattr
,
3164 .permission
= ll_inode_permission
,
3165 .listxattr
= ll_listxattr
,
3166 .fiemap
= ll_fiemap
,
3167 .get_acl
= ll_get_acl
,
3170 /* dynamic ioctl number support routines */
3171 static struct llioc_ctl_data
{
3172 struct rw_semaphore ioc_sem
;
3173 struct list_head ioc_head
;
3175 __RWSEM_INITIALIZER(llioc
.ioc_sem
),
3176 LIST_HEAD_INIT(llioc
.ioc_head
)
3180 struct list_head iocd_list
;
3181 unsigned int iocd_size
;
3182 llioc_callback_t iocd_cb
;
3183 unsigned int iocd_count
;
3184 unsigned int iocd_cmd
[0];
3187 void *ll_iocontrol_register(llioc_callback_t cb
, int count
, unsigned int *cmd
)
3190 struct llioc_data
*in_data
= NULL
;
3192 if (!cb
|| !cmd
|| count
> LLIOC_MAX_CMD
|| count
< 0)
3195 size
= sizeof(*in_data
) + count
* sizeof(unsigned int);
3196 in_data
= kzalloc(size
, GFP_NOFS
);
3200 in_data
->iocd_size
= size
;
3201 in_data
->iocd_cb
= cb
;
3202 in_data
->iocd_count
= count
;
3203 memcpy(in_data
->iocd_cmd
, cmd
, sizeof(unsigned int) * count
);
3205 down_write(&llioc
.ioc_sem
);
3206 list_add_tail(&in_data
->iocd_list
, &llioc
.ioc_head
);
3207 up_write(&llioc
.ioc_sem
);
3211 EXPORT_SYMBOL(ll_iocontrol_register
);
3213 void ll_iocontrol_unregister(void *magic
)
3215 struct llioc_data
*tmp
;
3220 down_write(&llioc
.ioc_sem
);
3221 list_for_each_entry(tmp
, &llioc
.ioc_head
, iocd_list
) {
3223 list_del(&tmp
->iocd_list
);
3224 up_write(&llioc
.ioc_sem
);
3230 up_write(&llioc
.ioc_sem
);
3232 CWARN("didn't find iocontrol register block with magic: %p\n", magic
);
3234 EXPORT_SYMBOL(ll_iocontrol_unregister
);
3236 static enum llioc_iter
3237 ll_iocontrol_call(struct inode
*inode
, struct file
*file
,
3238 unsigned int cmd
, unsigned long arg
, int *rcp
)
3240 enum llioc_iter ret
= LLIOC_CONT
;
3241 struct llioc_data
*data
;
3242 int rc
= -EINVAL
, i
;
3244 down_read(&llioc
.ioc_sem
);
3245 list_for_each_entry(data
, &llioc
.ioc_head
, iocd_list
) {
3246 for (i
= 0; i
< data
->iocd_count
; i
++) {
3247 if (cmd
!= data
->iocd_cmd
[i
])
3250 ret
= data
->iocd_cb(inode
, file
, cmd
, arg
, data
, &rc
);
3254 if (ret
== LLIOC_STOP
)
3257 up_read(&llioc
.ioc_sem
);
3264 int ll_layout_conf(struct inode
*inode
, const struct cl_object_conf
*conf
)
3266 struct ll_inode_info
*lli
= ll_i2info(inode
);
3267 struct cl_object
*obj
= lli
->lli_clob
;
3275 env
= cl_env_get(&refcheck
);
3277 return PTR_ERR(env
);
3279 rc
= cl_conf_set(env
, obj
, conf
);
3283 if (conf
->coc_opc
== OBJECT_CONF_SET
) {
3284 struct ldlm_lock
*lock
= conf
->coc_lock
;
3285 struct cl_layout cl
= {
3290 LASSERT(ldlm_has_layout(lock
));
3292 /* it can only be allowed to match after layout is
3293 * applied to inode otherwise false layout would be
3294 * seen. Applying layout should happen before dropping
3297 ldlm_lock_allow_match(lock
);
3299 rc
= cl_object_layout_get(env
, obj
, &cl
);
3303 CDEBUG(D_VFSTRACE
, DFID
": layout version change: %u -> %u\n",
3304 PFID(&lli
->lli_fid
), ll_layout_version_get(lli
),
3306 ll_layout_version_set(lli
, cl
.cl_layout_gen
);
3309 cl_env_put(env
, &refcheck
);
3313 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3314 static int ll_layout_fetch(struct inode
*inode
, struct ldlm_lock
*lock
)
3317 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
3318 struct ptlrpc_request
*req
;
3319 struct mdt_body
*body
;
3325 CDEBUG(D_INODE
, DFID
" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3326 PFID(ll_inode2fid(inode
)), ldlm_is_lvb_ready(lock
),
3327 lock
->l_lvb_data
, lock
->l_lvb_len
);
3329 if (lock
->l_lvb_data
&& ldlm_is_lvb_ready(lock
))
3332 /* if layout lock was granted right away, the layout is returned
3333 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3334 * blocked and then granted via completion ast, we have to fetch
3335 * layout here. Please note that we can't use the LVB buffer in
3336 * completion AST because it doesn't have a large enough buffer
3338 rc
= ll_get_default_mdsize(sbi
, &lmmsize
);
3340 rc
= md_getxattr(sbi
->ll_md_exp
, ll_inode2fid(inode
),
3341 OBD_MD_FLXATTR
, XATTR_NAME_LOV
, NULL
, 0,
3346 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
3352 lmmsize
= body
->mbo_eadatasize
;
3353 if (lmmsize
== 0) /* empty layout */ {
3358 lmm
= req_capsule_server_sized_get(&req
->rq_pill
, &RMF_EADATA
, lmmsize
);
3364 lvbdata
= libcfs_kvzalloc(lmmsize
, GFP_NOFS
);
3370 memcpy(lvbdata
, lmm
, lmmsize
);
3371 lock_res_and_lock(lock
);
3372 if (lock
->l_lvb_data
)
3373 kvfree(lock
->l_lvb_data
);
3375 lock
->l_lvb_data
= lvbdata
;
3376 lock
->l_lvb_len
= lmmsize
;
3377 unlock_res_and_lock(lock
);
3380 ptlrpc_req_finished(req
);
3385 * Apply the layout to the inode. Layout lock is held and will be released
3388 static int ll_layout_lock_set(struct lustre_handle
*lockh
, enum ldlm_mode mode
,
3389 struct inode
*inode
)
3391 struct ll_inode_info
*lli
= ll_i2info(inode
);
3392 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
3393 struct ldlm_lock
*lock
;
3394 struct cl_object_conf conf
;
3397 bool wait_layout
= false;
3399 LASSERT(lustre_handle_is_used(lockh
));
3401 lock
= ldlm_handle2lock(lockh
);
3403 LASSERT(ldlm_has_layout(lock
));
3405 LDLM_DEBUG(lock
, "File " DFID
"(%p) being reconfigured",
3406 PFID(&lli
->lli_fid
), inode
);
3408 /* in case this is a caching lock and reinstate with new inode */
3409 md_set_lock_data(sbi
->ll_md_exp
, lockh
, inode
, NULL
);
3411 lock_res_and_lock(lock
);
3412 lvb_ready
= ldlm_is_lvb_ready(lock
);
3413 unlock_res_and_lock(lock
);
3414 /* checking lvb_ready is racy but this is okay. The worst case is
3415 * that multi processes may configure the file on the same time.
3422 rc
= ll_layout_fetch(inode
, lock
);
3426 /* for layout lock, lmm is returned in lock's lvb.
3427 * lvb_data is immutable if the lock is held so it's safe to access it
3430 * set layout to file. Unlikely this will fail as old layout was
3433 memset(&conf
, 0, sizeof(conf
));
3434 conf
.coc_opc
= OBJECT_CONF_SET
;
3435 conf
.coc_inode
= inode
;
3436 conf
.coc_lock
= lock
;
3437 conf
.u
.coc_layout
.lb_buf
= lock
->l_lvb_data
;
3438 conf
.u
.coc_layout
.lb_len
= lock
->l_lvb_len
;
3439 rc
= ll_layout_conf(inode
, &conf
);
3441 /* refresh layout failed, need to wait */
3442 wait_layout
= rc
== -EBUSY
;
3445 LDLM_LOCK_PUT(lock
);
3446 ldlm_lock_decref(lockh
, mode
);
3448 /* wait for IO to complete if it's still being used. */
3450 CDEBUG(D_INODE
, "%s: " DFID
"(%p) wait for layout reconf\n",
3451 ll_get_fsname(inode
->i_sb
, NULL
, 0),
3452 PFID(&lli
->lli_fid
), inode
);
3454 memset(&conf
, 0, sizeof(conf
));
3455 conf
.coc_opc
= OBJECT_CONF_WAIT
;
3456 conf
.coc_inode
= inode
;
3457 rc
= ll_layout_conf(inode
, &conf
);
3462 "%s: file=" DFID
" waiting layout return: %d.\n",
3463 ll_get_fsname(inode
->i_sb
, NULL
, 0),
3464 PFID(&lli
->lli_fid
), rc
);
3469 static int ll_layout_refresh_locked(struct inode
*inode
)
3471 struct ll_inode_info
*lli
= ll_i2info(inode
);
3472 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
3473 struct md_op_data
*op_data
;
3474 struct lookup_intent it
;
3475 struct lustre_handle lockh
;
3476 enum ldlm_mode mode
;
3477 struct ldlm_enqueue_info einfo
= {
3478 .ei_type
= LDLM_IBITS
,
3480 .ei_cb_bl
= &ll_md_blocking_ast
,
3481 .ei_cb_cp
= &ldlm_completion_ast
,
3486 /* mostly layout lock is caching on the local side, so try to match
3487 * it before grabbing layout lock mutex.
3489 mode
= ll_take_md_lock(inode
, MDS_INODELOCK_LAYOUT
, &lockh
, 0,
3490 LCK_CR
| LCK_CW
| LCK_PR
| LCK_PW
);
3491 if (mode
!= 0) { /* hit cached lock */
3492 rc
= ll_layout_lock_set(&lockh
, mode
, inode
);
3498 op_data
= ll_prep_md_op_data(NULL
, inode
, inode
, NULL
,
3499 0, 0, LUSTRE_OPC_ANY
, NULL
);
3500 if (IS_ERR(op_data
))
3501 return PTR_ERR(op_data
);
3503 /* have to enqueue one */
3504 memset(&it
, 0, sizeof(it
));
3505 it
.it_op
= IT_LAYOUT
;
3506 lockh
.cookie
= 0ULL;
3508 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file " DFID
"(%p)",
3509 ll_get_fsname(inode
->i_sb
, NULL
, 0),
3510 PFID(&lli
->lli_fid
), inode
);
3512 rc
= md_enqueue(sbi
->ll_md_exp
, &einfo
, NULL
, &it
, op_data
, &lockh
, 0);
3513 ptlrpc_req_finished(it
.it_request
);
3514 it
.it_request
= NULL
;
3516 ll_finish_md_op_data(op_data
);
3518 mode
= it
.it_lock_mode
;
3519 it
.it_lock_mode
= 0;
3520 ll_intent_drop_lock(&it
);
3523 /* set lock data in case this is a new lock */
3524 ll_set_lock_data(sbi
->ll_md_exp
, inode
, &it
, NULL
);
3525 rc
= ll_layout_lock_set(&lockh
, mode
, inode
);
3534 * This function checks if there exists a LAYOUT lock on the client side,
3535 * or enqueues it if it doesn't have one in cache.
3537 * This function will not hold layout lock so it may be revoked any time after
3538 * this function returns. Any operations depend on layout should be redone
3541 * This function should be called before lov_io_init() to get an uptodate
3542 * layout version, the caller should save the version number and after IO
3543 * is finished, this function should be called again to verify that layout
3544 * is not changed during IO time.
3546 int ll_layout_refresh(struct inode
*inode
, __u32
*gen
)
3548 struct ll_inode_info
*lli
= ll_i2info(inode
);
3549 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
3552 *gen
= ll_layout_version_get(lli
);
3553 if (!(sbi
->ll_flags
& LL_SBI_LAYOUT_LOCK
) || *gen
!= CL_LAYOUT_GEN_NONE
)
3557 LASSERT(fid_is_sane(ll_inode2fid(inode
)));
3558 LASSERT(S_ISREG(inode
->i_mode
));
3560 /* take layout lock mutex to enqueue layout lock exclusively. */
3561 mutex_lock(&lli
->lli_layout_mutex
);
3563 rc
= ll_layout_refresh_locked(inode
);
3567 *gen
= ll_layout_version_get(lli
);
3569 mutex_unlock(&lli
->lli_layout_mutex
);
3575 * This function send a restore request to the MDT
3577 int ll_layout_restore(struct inode
*inode
, loff_t offset
, __u64 length
)
3579 struct hsm_user_request
*hur
;
3582 len
= sizeof(struct hsm_user_request
) +
3583 sizeof(struct hsm_user_item
);
3584 hur
= kzalloc(len
, GFP_NOFS
);
3588 hur
->hur_request
.hr_action
= HUA_RESTORE
;
3589 hur
->hur_request
.hr_archive_id
= 0;
3590 hur
->hur_request
.hr_flags
= 0;
3591 memcpy(&hur
->hur_user_item
[0].hui_fid
, &ll_i2info(inode
)->lli_fid
,
3592 sizeof(hur
->hur_user_item
[0].hui_fid
));
3593 hur
->hur_user_item
[0].hui_extent
.offset
= offset
;
3594 hur
->hur_user_item
[0].hui_extent
.length
= length
;
3595 hur
->hur_request
.hr_itemcount
= 1;
3596 rc
= obd_iocontrol(LL_IOC_HSM_REQUEST
, ll_i2sbi(inode
)->ll_md_exp
,