4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2015, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include "../include/lustre_dlm.h"
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/mount.h>
45 #include "../include/lustre/ll_fiemap.h"
46 #include "../include/lustre/lustre_ioctl.h"
47 #include "../include/lustre_swab.h"
49 #include "../include/cl_object.h"
50 #include "llite_internal.h"
53 ll_put_grouplock(struct inode
*inode
, struct file
*file
, unsigned long arg
);
55 static int ll_lease_close(struct obd_client_handle
*och
, struct inode
*inode
,
58 static enum llioc_iter
59 ll_iocontrol_call(struct inode
*inode
, struct file
*file
,
60 unsigned int cmd
, unsigned long arg
, int *rcp
);
/*
 * Allocate a zero-filled struct ll_file_data from ll_file_data_slab with
 * GFP_NOFS and reset its fd_write_failed flag.
 *
 * NOTE(review): the allocation-failure check and the return statement are
 * not visible in this listing - confirm against the full source.
 */
62 static struct ll_file_data
*ll_file_data_get(void)
64 struct ll_file_data
*fd
;
66 fd
= kmem_cache_zalloc(ll_file_data_slab
, GFP_NOFS
);
69 fd
->fd_write_failed
= false;
/*
 * Hand @fd back to ll_file_data_slab through kmem_cache_free().
 *
 * NOTE(review): a guard line may be missing from this listing - confirm
 * whether a NULL @fd is tolerated before relying on it.
 */
73 static void ll_file_data_put(struct ll_file_data
*fd
)
76 kmem_cache_free(ll_file_data_slab
, fd
);
80 * Packs all the attributes into @op_data for the CLOSE rpc.
82 static void ll_prepare_close(struct inode
*inode
, struct md_op_data
*op_data
,
83 struct obd_client_handle
*och
)
85 struct ll_inode_info
*lli
= ll_i2info(inode
);
87 ll_prep_md_op_data(op_data
, inode
, NULL
, NULL
,
88 0, 0, LUSTRE_OPC_ANY
, NULL
);
90 op_data
->op_attr
.ia_mode
= inode
->i_mode
;
91 op_data
->op_attr
.ia_atime
= inode
->i_atime
;
92 op_data
->op_attr
.ia_mtime
= inode
->i_mtime
;
93 op_data
->op_attr
.ia_ctime
= inode
->i_ctime
;
94 op_data
->op_attr
.ia_size
= i_size_read(inode
);
95 op_data
->op_attr
.ia_valid
|= ATTR_MODE
| ATTR_ATIME
| ATTR_ATIME_SET
|
96 ATTR_MTIME
| ATTR_MTIME_SET
|
97 ATTR_CTIME
| ATTR_CTIME_SET
;
98 op_data
->op_attr_blocks
= inode
->i_blocks
;
99 op_data
->op_attr_flags
= ll_inode_to_ext_flags(inode
->i_flags
);
100 op_data
->op_handle
= och
->och_fh
;
103 * For HSM: if inode data has been modified, pack it so that
104 * MDT can set data dirty flag in the archive.
106 if (och
->och_flags
& FMODE_WRITE
&&
107 test_and_clear_bit(LLIF_DATA_MODIFIED
, &lli
->lli_flags
))
108 op_data
->op_bias
|= MDS_DATA_MODIFIED
;
112 * Perform a close, possibly with a bias.
113 * The meaning of "data" depends on the value of "bias".
115 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
116 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with.
/*
 * Close open handle @och of @inode on the MDS through md_close().
 *
 * Visible steps, in order: reject an invalid MDC export (class_exp2obd()),
 * allocate op_data with GFP_NOFS, pack the inode attributes with
 * ll_prepare_close(), add the MDS_CLOSE_LAYOUT_SWAP or MDS_HSM_RELEASE
 * specific fields taken from "data", call md_close() and log any failure
 * other than -EINTR, test OBD_MD_CLOSE_INTENT_EXECED in the reply body for
 * biased closes, then finish op_data, clear the open replay data, stamp the
 * handle cookie with DEAD_HANDLE_MAGIC and drop the request.
 *
 * NOTE(review): several lines (trailing parameters, the switch header, the
 * error returns and the release of @och) are not visible in this listing -
 * confirm against the full source before relying on this summary.
 */
119 static int ll_close_inode_openhandle(struct inode
*inode
,
120 struct obd_client_handle
*och
,
121 enum mds_op_bias bias
,
124 const struct ll_inode_info
*lli
= ll_i2info(inode
);
125 struct obd_export
*md_exp
= ll_i2mdexp(inode
);
126 struct md_op_data
*op_data
;
127 struct ptlrpc_request
*req
= NULL
;
130 if (!class_exp2obd(md_exp
)) {
131 CERROR("%s: invalid MDC connection handle closing " DFID
"\n",
132 ll_get_fsname(inode
->i_sb
, NULL
, 0),
133 PFID(&lli
->lli_fid
));
138 op_data
= kzalloc(sizeof(*op_data
), GFP_NOFS
);
140 * We leak openhandle and request here on error, but not much to be
141 * done in OOM case since app won't retry close on error either.
148 ll_prepare_close(inode
, op_data
, och
);
150 case MDS_CLOSE_LAYOUT_SWAP
:
152 op_data
->op_bias
|= MDS_CLOSE_LAYOUT_SWAP
;
153 op_data
->op_data_version
= 0;
154 op_data
->op_lease_handle
= och
->och_lease_handle
;
155 op_data
->op_fid2
= *ll_inode2fid(data
);
158 case MDS_HSM_RELEASE
:
160 op_data
->op_bias
|= MDS_HSM_RELEASE
;
161 op_data
->op_data_version
= *(__u64
*)data
;
162 op_data
->op_lease_handle
= och
->och_lease_handle
;
163 op_data
->op_attr
.ia_valid
|= ATTR_SIZE
| ATTR_BLOCKS
;
171 rc
= md_close(md_exp
, op_data
, och
->och_mod
, &req
);
172 if (rc
&& rc
!= -EINTR
) {
173 CERROR("%s: inode " DFID
" mdc close failed: rc = %d\n",
174 md_exp
->exp_obd
->obd_name
, PFID(&lli
->lli_fid
), rc
);
177 if (op_data
->op_bias
& (MDS_HSM_RELEASE
| MDS_CLOSE_LAYOUT_SWAP
) &&
179 struct mdt_body
*body
;
181 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
182 if (!(body
->mbo_valid
& OBD_MD_CLOSE_INTENT_EXECED
))
186 ll_finish_md_op_data(op_data
);
189 md_clear_open_replay_data(md_exp
, och
);
190 och
->och_fh
.cookie
= DEAD_HANDLE_MAGIC
;
193 ptlrpc_req_finished(req
);
/*
 * Close the MDS open handle of @inode that corresponds to @fmode.
 *
 * The handle slot and its user counter are picked by mode: FMODE_WRITE
 * first, then FMODE_EXEC, otherwise FMODE_READ (asserted with LASSERT).
 * The counter is examined under lli_och_mutex; per the in-body comment the
 * close is skipped while the handle still has users. The handle is finally
 * passed to ll_close_inode_openhandle() with bias 0 and NULL data.
 *
 * NOTE(review): the declarations, the early-return path and the code that
 * detaches the handle from the slot are not visible in this listing.
 */
197 int ll_md_real_close(struct inode
*inode
, fmode_t fmode
)
199 struct ll_inode_info
*lli
= ll_i2info(inode
);
200 struct obd_client_handle
**och_p
;
201 struct obd_client_handle
*och
;
205 if (fmode
& FMODE_WRITE
) {
206 och_p
= &lli
->lli_mds_write_och
;
207 och_usecount
= &lli
->lli_open_fd_write_count
;
208 } else if (fmode
& FMODE_EXEC
) {
209 och_p
= &lli
->lli_mds_exec_och
;
210 och_usecount
= &lli
->lli_open_fd_exec_count
;
212 LASSERT(fmode
& FMODE_READ
);
213 och_p
= &lli
->lli_mds_read_och
;
214 och_usecount
= &lli
->lli_open_fd_read_count
;
217 mutex_lock(&lli
->lli_och_mutex
);
218 if (*och_usecount
> 0) {
219 /* There are still users of this handle, so skip
222 mutex_unlock(&lli
->lli_och_mutex
);
228 mutex_unlock(&lli
->lli_och_mutex
);
231 /* There might be a race and this handle may already
234 rc
= ll_close_inode_openhandle(inode
, och
, 0, NULL
);
/*
 * Tear down the per-open state kept in the private data of @file.
 *
 * Visible steps: drop a group lock flagged by LL_FILE_GROUP_LOCKED, close a
 * lease handle left in fd->fd_lease_och with ll_lease_close(), close
 * fd->fd_och through ll_close_inode_openhandle(), decrement the open
 * counter matching fd->fd_omode under lli_och_mutex, and call
 * ll_md_real_close() when md_lock_match() finds no MDS_INODELOCK_OPEN
 * inodebits lock. The file's private data is then cleared and @fd released.
 *
 * NOTE(review): several declarations, guards and the return statement are
 * not visible in this listing - confirm the exact branching in the full
 * source.
 */
240 static int ll_md_close(struct inode
*inode
, struct file
*file
)
242 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
243 struct ll_inode_info
*lli
= ll_i2info(inode
);
245 __u64 flags
= LDLM_FL_BLOCK_GRANTED
| LDLM_FL_TEST_LOCK
;
246 struct lustre_handle lockh
;
247 union ldlm_policy_data policy
= {
248 .l_inodebits
= { MDS_INODELOCK_OPEN
}
252 /* clear group lock, if present */
253 if (unlikely(fd
->fd_flags
& LL_FILE_GROUP_LOCKED
))
254 ll_put_grouplock(inode
, file
, fd
->fd_grouplock
.lg_gid
);
256 if (fd
->fd_lease_och
) {
259 /* Usually the lease is not released when the
260 * application crashed, we need to release here.
262 rc
= ll_lease_close(fd
->fd_lease_och
, inode
, &lease_broken
);
263 CDEBUG(rc
? D_ERROR
: D_INODE
,
264 "Clean up lease " DFID
" %d/%d\n",
265 PFID(&lli
->lli_fid
), rc
, lease_broken
);
267 fd
->fd_lease_och
= NULL
;
271 rc
= ll_close_inode_openhandle(inode
, fd
->fd_och
, 0, NULL
);
276 /* Let's see if we have good enough OPEN lock on the file and if
277 * we can skip talking to MDS
280 mutex_lock(&lli
->lli_och_mutex
);
281 if (fd
->fd_omode
& FMODE_WRITE
) {
283 LASSERT(lli
->lli_open_fd_write_count
);
284 lli
->lli_open_fd_write_count
--;
285 } else if (fd
->fd_omode
& FMODE_EXEC
) {
287 LASSERT(lli
->lli_open_fd_exec_count
);
288 lli
->lli_open_fd_exec_count
--;
291 LASSERT(lli
->lli_open_fd_read_count
);
292 lli
->lli_open_fd_read_count
--;
294 mutex_unlock(&lli
->lli_och_mutex
);
296 if (!md_lock_match(ll_i2mdexp(inode
), flags
, ll_inode2fid(inode
),
297 LDLM_IBITS
, &policy
, lockmode
, &lockh
))
298 rc
= ll_md_real_close(inode
, fd
->fd_omode
);
301 LUSTRE_FPRIVATE(file
) = NULL
;
302 ll_file_data_put(fd
);
307 /* While this returns an error code, fput() the caller does not, so we need
308 * to make every effort to clean up all of our state here. Also, applications
309 * rarely check close errors and even if an error is returned they will not
310 * re-try the close call.
/*
 * Release @file on @inode.
 *
 * Visible steps: trace the call, tally LPROC_LL_RELEASE for non-root
 * inodes, deauthorize statahead when this fd is the directory's
 * lli_opendir_key, drop the ll_file_data directly for the root inode, call
 * lov_read_and_clear_async_rc() and reset lli_async_rc for non-directories,
 * then finish with ll_md_close(). A debug log dump is requested when the
 * OBD_FAIL_PTLRPC_DUMP_LOG fail point fires.
 *
 * NOTE(review): the local "rc" handling and the return statements are not
 * visible in this listing.
 */
312 int ll_file_release(struct inode
*inode
, struct file
*file
)
314 struct ll_file_data
*fd
;
315 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
316 struct ll_inode_info
*lli
= ll_i2info(inode
);
319 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p)\n",
320 PFID(ll_inode2fid(inode
)), inode
);
322 if (!is_root_inode(inode
))
323 ll_stats_ops_tally(sbi
, LPROC_LL_RELEASE
, 1);
324 fd
= LUSTRE_FPRIVATE(file
);
327 /* The last ref on @file, maybe not be the owner pid of statahead,
328 * because parent and child process can share the same file handle.
330 if (S_ISDIR(inode
->i_mode
) && lli
->lli_opendir_key
== fd
)
331 ll_deauthorize_statahead(inode
, fd
);
333 if (is_root_inode(inode
)) {
334 LUSTRE_FPRIVATE(file
) = NULL
;
335 ll_file_data_put(fd
);
339 if (!S_ISDIR(inode
->i_mode
)) {
341 lov_read_and_clear_async_rc(lli
->lli_clob
);
342 lli
->lli_async_rc
= 0;
345 rc
= ll_md_close(inode
, file
);
347 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG
, cfs_fail_val
))
348 libcfs_debug_dumplog();
/*
 * Execute an open-by-FID intent for the dentry @de.
 *
 * @itp must already carry MDS_OPEN_BY_FID (asserted). @lmm/@lmmsize are
 * attached to the request as op_data->op_data/op_data_size. The file name
 * is packed only when the server lacks OBD_CONNECT_OPEN_BY_FID and the name
 * passes lu_name_is_valid_2(). After md_intent_lock() the inode is
 * refreshed with ll_prep_inode() and, when a lock mode was granted, lock
 * data is set; the request and the intent lock are then dropped.
 *
 * NOTE(review): the error branches, the handling of DISP_LOOKUP_NEG and the
 * final -ENOENT/IT_CREAT conversion are only partly visible in this
 * listing - confirm the returned codes against the full source.
 */
353 static int ll_intent_file_open(struct dentry
*de
, void *lmm
, int lmmsize
,
354 struct lookup_intent
*itp
)
356 struct inode
*inode
= d_inode(de
);
357 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
358 struct dentry
*parent
= de
->d_parent
;
359 const char *name
= NULL
;
360 struct md_op_data
*op_data
;
361 struct ptlrpc_request
*req
= NULL
;
365 LASSERT(itp
->it_flags
& MDS_OPEN_BY_FID
);
368 * if server supports open-by-fid, or file name is invalid, don't pack
369 * name in open request
371 if (!(exp_connect_flags(sbi
->ll_md_exp
) & OBD_CONNECT_OPEN_BY_FID
) &&
372 lu_name_is_valid_2(de
->d_name
.name
, de
->d_name
.len
)) {
373 name
= de
->d_name
.name
;
374 len
= de
->d_name
.len
;
377 op_data
= ll_prep_md_op_data(NULL
, d_inode(parent
), inode
, name
, len
,
378 O_RDWR
, LUSTRE_OPC_ANY
, NULL
);
380 return PTR_ERR(op_data
);
381 op_data
->op_data
= lmm
;
382 op_data
->op_data_size
= lmmsize
;
384 rc
= md_intent_lock(sbi
->ll_md_exp
, op_data
, itp
, &req
,
385 &ll_md_blocking_ast
, 0);
386 ll_finish_md_op_data(op_data
);
388 /* reason for keep own exit path - don`t flood log
389 * with messages with -ESTALE errors.
391 if (!it_disposition(itp
, DISP_OPEN_OPEN
) ||
392 it_open_error(DISP_OPEN_OPEN
, itp
))
394 ll_release_openhandle(inode
, itp
);
398 if (it_disposition(itp
, DISP_LOOKUP_NEG
)) {
403 if (rc
!= 0 || it_open_error(DISP_OPEN_OPEN
, itp
)) {
404 rc
= rc
? rc
: it_open_error(DISP_OPEN_OPEN
, itp
);
405 CDEBUG(D_VFSTRACE
, "lock enqueue: err: %d\n", rc
);
409 rc
= ll_prep_inode(&inode
, req
, NULL
, itp
);
410 if (!rc
&& itp
->it_lock_mode
)
411 ll_set_lock_data(sbi
->ll_md_exp
, inode
, itp
, NULL
);
414 ptlrpc_req_finished(req
);
415 ll_intent_drop_lock(itp
);
418 * We did open by fid, but by the time we got to the server,
419 * the object disappeared. If this is a create, we cannot really
420 * tell the userspace that the file it was trying to create
421 * does not exist. Instead let's return -ESTALE, and the VFS will
422 * retry the create with LOOKUP_REVAL that we are going to catch
423 * in ll_revalidate_dentry() and use lookup then.
425 if (rc
== -ENOENT
&& itp
->it_op
& IT_CREAT
)
431 static int ll_och_fill(struct obd_export
*md_exp
, struct lookup_intent
*it
,
432 struct obd_client_handle
*och
)
434 struct mdt_body
*body
;
436 body
= req_capsule_server_get(&it
->it_request
->rq_pill
, &RMF_MDT_BODY
);
437 och
->och_fh
= body
->mbo_handle
;
438 och
->och_fid
= body
->mbo_fid1
;
439 och
->och_lease_handle
.cookie
= it
->it_lock_handle
;
440 och
->och_magic
= OBD_CLIENT_HANDLE_MAGIC
;
441 och
->och_flags
= it
->it_flags
;
443 return md_set_open_replay_data(md_exp
, och
, it
);
/*
 * Attach @fd to @file once the open has been granted.
 *
 * Asserts that @file carries no private data yet, fills @och from @it with
 * ll_och_fill(), stores @fd as the file's private data, initialises the
 * readahead state, records the open mode (the FMODE_READ/WRITE/EXEC bits of
 * it->it_flags) and initialises fd_lock and fd_lccs.
 *
 * NOTE(review): the condition guarding the ll_och_fill() call and the
 * return statements are not visible in this listing.
 */
446 static int ll_local_open(struct file
*file
, struct lookup_intent
*it
,
447 struct ll_file_data
*fd
, struct obd_client_handle
*och
)
449 struct inode
*inode
= file_inode(file
);
451 LASSERT(!LUSTRE_FPRIVATE(file
));
458 rc
= ll_och_fill(ll_i2sbi(inode
)->ll_md_exp
, it
, och
);
463 LUSTRE_FPRIVATE(file
) = fd
;
464 ll_readahead_init(inode
, &fd
->fd_ras
);
465 fd
->fd_omode
= it
->it_flags
& (FMODE_READ
| FMODE_WRITE
| FMODE_EXEC
);
467 /* ll_cl_context initialize */
468 rwlock_init(&fd
->fd_lock
);
469 INIT_LIST_HEAD(&fd
->fd_lccs
);
474 /* Open a file, and (for the very first open) create objects on the OSTs at
475 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
476 * creation or open until ll_lov_setstripe() ioctl is called.
478 * If we already have the stripe MD locally then we don't request it in
479 * md_open(), by passing a lmm_size = 0.
481 * It is up to the application to ensure no other processes open this file
482 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
483 * used. We might be able to avoid races of that sort by getting lli_open_sem
484 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
485 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * Open @file on @inode.
 *
 * Visible outline: pick up the intent left in file->private_data, allocate
 * the ll_file_data, authorize statahead for directories, and short-cut the
 * root inode by only attaching the fd. When no executed intent is present,
 * the local "oit" is prepared from f_flags (access mode, O_TRUNC, O_EXCL
 * removal, O_CREAT -> IT_CREAT). The per-mode open handle slot and counter
 * are then selected and, under lli_och_mutex, an existing MDS open handle
 * is reused or a new one is obtained with ll_intent_file_open() before
 * ll_local_open() is called. On the way out the open is tallied in
 * LPROC_LL_OPEN and a request reference flagged by DISP_ENQ_OPEN_REF is
 * dropped.
 *
 * NOTE(review): many lines (gotos/labels, counter updates, error returns)
 * are not visible in this listing - the outline above covers only the
 * statements that are shown.
 */
487 int ll_file_open(struct inode
*inode
, struct file
*file
)
489 struct ll_inode_info
*lli
= ll_i2info(inode
);
490 struct lookup_intent
*it
, oit
= { .it_op
= IT_OPEN
,
491 .it_flags
= file
->f_flags
};
492 struct obd_client_handle
**och_p
= NULL
;
493 __u64
*och_usecount
= NULL
;
494 struct ll_file_data
*fd
;
497 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p), flags %o\n",
498 PFID(ll_inode2fid(inode
)), inode
, file
->f_flags
);
500 it
= file
->private_data
; /* XXX: compat macro */
501 file
->private_data
= NULL
; /* prevent ll_local_open assertion */
503 fd
= ll_file_data_get();
510 if (S_ISDIR(inode
->i_mode
))
511 ll_authorize_statahead(inode
, fd
);
513 if (is_root_inode(inode
)) {
514 LUSTRE_FPRIVATE(file
) = fd
;
518 if (!it
|| !it
->it_disposition
) {
519 /* Convert f_flags into access mode. We cannot use file->f_mode,
520 * because everything but O_ACCMODE mask was stripped from
523 if ((oit
.it_flags
+ 1) & O_ACCMODE
)
525 if (file
->f_flags
& O_TRUNC
)
526 oit
.it_flags
|= FMODE_WRITE
;
528 /* kernel only call f_op->open in dentry_open. filp_open calls
529 * dentry_open after call to open_namei that checks permissions.
530 * Only nfsd_open call dentry_open directly without checking
531 * permissions and because of that this code below is safe.
533 if (oit
.it_flags
& (FMODE_WRITE
| FMODE_READ
))
534 oit
.it_flags
|= MDS_OPEN_OWNEROVERRIDE
;
536 /* We do not want O_EXCL here, presumably we opened the file
537 * already? XXX - NFS implications?
539 oit
.it_flags
&= ~O_EXCL
;
541 /* bug20584, if "it_flags" contains O_CREAT, the file will be
542 * created if necessary, then "IT_CREAT" should be set to keep
545 if (oit
.it_flags
& O_CREAT
)
546 oit
.it_op
|= IT_CREAT
;
552 /* Let's see if we have file open on MDS already. */
553 if (it
->it_flags
& FMODE_WRITE
) {
554 och_p
= &lli
->lli_mds_write_och
;
555 och_usecount
= &lli
->lli_open_fd_write_count
;
556 } else if (it
->it_flags
& FMODE_EXEC
) {
557 och_p
= &lli
->lli_mds_exec_och
;
558 och_usecount
= &lli
->lli_open_fd_exec_count
;
560 och_p
= &lli
->lli_mds_read_och
;
561 och_usecount
= &lli
->lli_open_fd_read_count
;
564 mutex_lock(&lli
->lli_och_mutex
);
565 if (*och_p
) { /* Open handle is present */
566 if (it_disposition(it
, DISP_OPEN_OPEN
)) {
567 /* Well, there's extra open request that we do not need,
568 * let's close it somehow. This will decref request.
570 rc
= it_open_error(DISP_OPEN_OPEN
, it
);
572 mutex_unlock(&lli
->lli_och_mutex
);
576 ll_release_openhandle(inode
, it
);
580 rc
= ll_local_open(file
, it
, fd
, NULL
);
583 mutex_unlock(&lli
->lli_och_mutex
);
587 LASSERT(*och_usecount
== 0);
588 if (!it
->it_disposition
) {
589 /* We cannot just request lock handle now, new ELC code
590 * means that one of other OPEN locks for this file
591 * could be cancelled, and since blocking ast handler
592 * would attempt to grab och_mutex as well, that would
593 * result in a deadlock
595 mutex_unlock(&lli
->lli_och_mutex
);
597 * Normally called under two situations:
599 * 2. revalidate with IT_OPEN (revalidate doesn't
600 * execute this intent any more).
602 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
604 * Always specify MDS_OPEN_BY_FID because we don't want
605 * to get file with different fid.
607 it
->it_flags
|= MDS_OPEN_LOCK
| MDS_OPEN_BY_FID
;
608 rc
= ll_intent_file_open(file
->f_path
.dentry
, NULL
, 0, it
);
614 *och_p
= kzalloc(sizeof(struct obd_client_handle
), GFP_NOFS
);
622 /* md_intent_lock() didn't get a request ref if there was an
623 * open error, so don't do cleanup on the request here
626 /* XXX (green): Should not we bail out on any error here, not
629 rc
= it_open_error(DISP_OPEN_OPEN
, it
);
633 LASSERTF(it_disposition(it
, DISP_ENQ_OPEN_REF
),
634 "inode %p: disposition %x, status %d\n", inode
,
635 it_disposition(it
, ~0), it
->it_status
);
637 rc
= ll_local_open(file
, it
, fd
, *och_p
);
641 mutex_unlock(&lli
->lli_och_mutex
);
644 /* Must do this outside lli_och_mutex lock to prevent deadlock where
645 * different kind of OPEN lock for this same inode gets cancelled
648 if (!S_ISREG(inode
->i_mode
))
651 cl_lov_delay_create_clear(&file
->f_flags
);
656 if (och_p
&& *och_p
) {
661 mutex_unlock(&lli
->lli_och_mutex
);
664 if (lli
->lli_opendir_key
== fd
)
665 ll_deauthorize_statahead(inode
, fd
);
667 ll_file_data_put(fd
);
669 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_OPEN
, 1);
672 if (it
&& it_disposition(it
, DISP_ENQ_OPEN_REF
)) {
673 ptlrpc_req_finished(it
->it_request
);
674 it_clear_disposition(it
, DISP_ENQ_OPEN_REF
);
/*
 * Blocking callback used for lease locks.
 *
 * For LDLM_CB_BLOCKING the lock is converted to a handle and cancelled
 * asynchronously with ldlm_cli_cancel(LCF_ASYNC); the result is logged
 * with CDEBUG.
 *
 * NOTE(review): the switch header, the body of the LDLM_CB_CANCELING case
 * and the return statement are not visible in this listing.
 */
680 static int ll_md_blocking_lease_ast(struct ldlm_lock
*lock
,
681 struct ldlm_lock_desc
*desc
,
682 void *data
, int flag
)
685 struct lustre_handle lockh
;
688 case LDLM_CB_BLOCKING
:
689 ldlm_lock2handle(lock
, &lockh
);
690 rc
= ldlm_cli_cancel(&lockh
, LCF_ASYNC
);
692 CDEBUG(D_INODE
, "ldlm_cli_cancel: %d\n", rc
);
696 case LDLM_CB_CANCELING
:
704 * Acquire a lease and open the file.
/*
 * Open @inode with MDS_OPEN_LEASE to obtain a lease of mode @fmode.
 *
 * Only FMODE_WRITE or FMODE_READ is accepted (-EINVAL otherwise). The mode
 * of @file must include @fmode and must not contain FMODE_EXEC (-EPERM).
 * The existing open handle of the file is sent as op_data->op_handle so
 * the MDT can tell that the new open comes from the same owner. The intent
 * is enqueued with ll_md_blocking_lease_ast as blocking callback and the
 * LDLM_FL_NO_LRU | LDLM_FL_EXCL flags. On the visible failure path the
 * open lock is cancelled and the new handle is closed again with
 * ll_close_inode_openhandle().
 *
 * NOTE(review): the remaining parameters, several guards (including the
 * test that makes the @file checks conditional), labels and return
 * statements are not visible in this listing.
 */
706 static struct obd_client_handle
*
707 ll_lease_open(struct inode
*inode
, struct file
*file
, fmode_t fmode
,
710 struct lookup_intent it
= { .it_op
= IT_OPEN
};
711 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
712 struct md_op_data
*op_data
;
713 struct ptlrpc_request
*req
= NULL
;
714 struct lustre_handle old_handle
= { 0 };
715 struct obd_client_handle
*och
= NULL
;
719 if (fmode
!= FMODE_WRITE
&& fmode
!= FMODE_READ
)
720 return ERR_PTR(-EINVAL
);
723 struct ll_inode_info
*lli
= ll_i2info(inode
);
724 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
725 struct obd_client_handle
**och_p
;
728 if (!(fmode
& file
->f_mode
) || (file
->f_mode
& FMODE_EXEC
))
729 return ERR_PTR(-EPERM
);
731 /* Get the openhandle of the file */
733 mutex_lock(&lli
->lli_och_mutex
);
734 if (fd
->fd_lease_och
) {
735 mutex_unlock(&lli
->lli_och_mutex
);
740 if (file
->f_mode
& FMODE_WRITE
) {
741 LASSERT(lli
->lli_mds_write_och
);
742 och_p
= &lli
->lli_mds_write_och
;
743 och_usecount
= &lli
->lli_open_fd_write_count
;
745 LASSERT(lli
->lli_mds_read_och
);
746 och_p
= &lli
->lli_mds_read_och
;
747 och_usecount
= &lli
->lli_open_fd_read_count
;
749 if (*och_usecount
== 1) {
756 mutex_unlock(&lli
->lli_och_mutex
);
757 if (rc
< 0) /* more than 1 opener */
761 old_handle
= fd
->fd_och
->och_fh
;
764 och
= kzalloc(sizeof(*och
), GFP_NOFS
);
766 return ERR_PTR(-ENOMEM
);
768 op_data
= ll_prep_md_op_data(NULL
, inode
, inode
, NULL
, 0, 0,
769 LUSTRE_OPC_ANY
, NULL
);
770 if (IS_ERR(op_data
)) {
771 rc
= PTR_ERR(op_data
);
775 /* To tell the MDT this openhandle is from the same owner */
776 op_data
->op_handle
= old_handle
;
778 it
.it_flags
= fmode
| open_flags
;
779 it
.it_flags
|= MDS_OPEN_LOCK
| MDS_OPEN_BY_FID
| MDS_OPEN_LEASE
;
780 rc
= md_intent_lock(sbi
->ll_md_exp
, op_data
, &it
, &req
,
781 &ll_md_blocking_lease_ast
,
782 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
783 * it can be cancelled which may mislead applications that the lease is
785 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
786 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
787 * doesn't deal with openhandle, so normal openhandle will be leaked.
789 LDLM_FL_NO_LRU
| LDLM_FL_EXCL
);
790 ll_finish_md_op_data(op_data
);
791 ptlrpc_req_finished(req
);
795 if (it_disposition(&it
, DISP_LOOKUP_NEG
)) {
800 rc
= it_open_error(DISP_OPEN_OPEN
, &it
);
804 LASSERT(it_disposition(&it
, DISP_ENQ_OPEN_REF
));
805 ll_och_fill(sbi
->ll_md_exp
, &it
, och
);
807 if (!it_disposition(&it
, DISP_OPEN_LEASE
)) /* old server? */ {
812 /* already get lease, handle lease lock */
813 ll_set_lock_data(sbi
->ll_md_exp
, inode
, &it
, NULL
);
814 if (it
.it_lock_mode
== 0 ||
815 it
.it_lock_bits
!= MDS_INODELOCK_OPEN
) {
816 /* open lock must return for lease */
817 CERROR(DFID
"lease granted but no open lock, %d/%llu.\n",
818 PFID(ll_inode2fid(inode
)), it
.it_lock_mode
,
824 ll_intent_release(&it
);
828 /* Cancel open lock */
829 if (it
.it_lock_mode
!= 0) {
830 ldlm_lock_decref_and_cancel(&och
->och_lease_handle
,
833 och
->och_lease_handle
.cookie
= 0ULL;
835 rc2
= ll_close_inode_openhandle(inode
, och
, 0, NULL
);
837 CERROR("%s: error closing file " DFID
": %d\n",
838 ll_get_fsname(inode
->i_sb
, NULL
, 0),
839 PFID(&ll_i2info(inode
)->lli_fid
), rc2
);
840 och
= NULL
; /* och has been freed in ll_close_inode_openhandle() */
842 ll_intent_release(&it
);
849 * Check whether a layout swap can be done between two inodes.
851 * \param[in] inode1 First inode to check
852 * \param[in] inode2 Second inode to check
854 * \retval 0 on success, layout swap can be performed between both inodes
855 * \retval negative error code if requirements are not met
/*
 * Validate a layout swap between @inode1 and @inode2. The visible checks,
 * in order: both inodes must be regular files, inode_permission(MAY_WRITE)
 * must succeed on both, and both must belong to the same super block.
 *
 * NOTE(review): the return statement attached to each check is not visible
 * in this listing - confirm the error codes against the full source.
 */
857 static int ll_check_swap_layouts_validity(struct inode
*inode1
,
858 struct inode
*inode2
)
860 if (!S_ISREG(inode1
->i_mode
) || !S_ISREG(inode2
->i_mode
))
863 if (inode_permission(inode1
, MAY_WRITE
) ||
864 inode_permission(inode2
, MAY_WRITE
))
867 if (inode1
->i_sb
!= inode2
->i_sb
)
/*
 * Close @och with the MDS_CLOSE_LAYOUT_SWAP bias so that the layouts of
 * @inode and @inode2 are swapped as part of the close.
 *
 * The pair is first validated with ll_check_swap_layouts_validity() and
 * the two FIDs are compared with lu_fid_cmp(). Per the trailing comment,
 * @och is freed inside ll_close_inode_openhandle().
 *
 * NOTE(review): the handling of the validation and comparison results and
 * the cleanup path are not visible in this listing.
 */
873 static int ll_swap_layouts_close(struct obd_client_handle
*och
,
874 struct inode
*inode
, struct inode
*inode2
)
876 const struct lu_fid
*fid1
= ll_inode2fid(inode
);
877 const struct lu_fid
*fid2
;
880 CDEBUG(D_INODE
, "%s: biased close of file " DFID
"\n",
881 ll_get_fsname(inode
->i_sb
, NULL
, 0), PFID(fid1
));
883 rc
= ll_check_swap_layouts_validity(inode
, inode2
);
887 /* We now know that inode2 is a lustre inode */
888 fid2
= ll_inode2fid(inode2
);
890 rc
= lu_fid_cmp(fid1
, fid2
);
897 * Close the file and swap layouts between inode & inode2.
898 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
899 * because we still need it to pack l_remote_handle to MDT.
901 rc
= ll_close_inode_openhandle(inode
, och
, MDS_CLOSE_LAYOUT_SWAP
,
904 och
= NULL
; /* freed in ll_close_inode_openhandle() */
912 * Release lease and close the file.
913 * It will check if the lease has ever broken.
/*
 * Release the lease held through @och and close the handle.
 *
 * The lease lock is looked up from och->och_lease_handle; its cancel state
 * (ldlm_is_cancel(), read under lock_res_and_lock()) tells whether the
 * lease was broken, and that value is stored in *lease_broken. The lock is
 * cancelled with ldlm_cli_cancel() and the handle is closed through
 * ll_close_inode_openhandle() with bias 0.
 *
 * NOTE(review): the last parameter and the guards around the lock access,
 * the cancel call and the *lease_broken store are not visible in this
 * listing.
 */
915 static int ll_lease_close(struct obd_client_handle
*och
, struct inode
*inode
,
918 struct ldlm_lock
*lock
;
919 bool cancelled
= true;
921 lock
= ldlm_handle2lock(&och
->och_lease_handle
);
923 lock_res_and_lock(lock
);
924 cancelled
= ldlm_is_cancel(lock
);
925 unlock_res_and_lock(lock
);
929 CDEBUG(D_INODE
, "lease for " DFID
" broken? %d\n",
930 PFID(&ll_i2info(inode
)->lli_fid
), cancelled
);
933 ldlm_cli_cancel(&och
->och_lease_handle
, 0);
935 *lease_broken
= cancelled
;
937 return ll_close_inode_openhandle(inode
, och
, 0, NULL
);
/*
 * Merge the attributes cached in ll_inode_info with those reported by the
 * inode's cl_object.
 *
 * Under the inode size lock, i_atime/i_mtime/i_ctime are first seeded from
 * lli_atime/lli_mtime/lli_ctime. cl_object_attr_get() is then called under
 * the object's attr lock; each timestamp is raised to the cl_attr value
 * when that value is larger, and i_size and i_blocks are taken from
 * cat_size and cat_blocks.
 *
 * NOTE(review): the local declarations, the error test before
 * "goto out_size_unlock", the label itself and the return statement are
 * not visible in this listing.
 */
940 int ll_merge_attr(const struct lu_env
*env
, struct inode
*inode
)
942 struct ll_inode_info
*lli
= ll_i2info(inode
);
943 struct cl_object
*obj
= lli
->lli_clob
;
944 struct cl_attr
*attr
= vvp_env_thread_attr(env
);
950 ll_inode_size_lock(inode
);
952 /* merge timestamps the most recently obtained from mds with
953 * timestamps obtained from osts
955 LTIME_S(inode
->i_atime
) = lli
->lli_atime
;
956 LTIME_S(inode
->i_mtime
) = lli
->lli_mtime
;
957 LTIME_S(inode
->i_ctime
) = lli
->lli_ctime
;
959 mtime
= LTIME_S(inode
->i_mtime
);
960 atime
= LTIME_S(inode
->i_atime
);
961 ctime
= LTIME_S(inode
->i_ctime
);
963 cl_object_attr_lock(obj
);
964 rc
= cl_object_attr_get(env
, obj
, attr
);
965 cl_object_attr_unlock(obj
);
968 goto out_size_unlock
;
970 if (atime
< attr
->cat_atime
)
971 atime
= attr
->cat_atime
;
973 if (ctime
< attr
->cat_ctime
)
974 ctime
= attr
->cat_ctime
;
976 if (mtime
< attr
->cat_mtime
)
977 mtime
= attr
->cat_mtime
;
979 CDEBUG(D_VFSTRACE
, DFID
" updating i_size %llu\n",
980 PFID(&lli
->lli_fid
), attr
->cat_size
);
982 i_size_write(inode
, attr
->cat_size
);
984 inode
->i_blocks
= attr
->cat_blocks
;
986 LTIME_S(inode
->i_mtime
) = mtime
;
987 LTIME_S(inode
->i_atime
) = atime
;
988 LTIME_S(inode
->i_ctime
) = ctime
;
991 ll_inode_size_unlock(inode
);
/*
 * Decide whether atime updates are suppressed for @file (adapted from
 * file_accessed() and touch_atime(), per the in-body comment).
 *
 * Conditions examined, in order: O_NOATIME on the file, S_NOATIME on the
 * inode, IS_NOATIME(inode), MNT_NOATIME or MNT_READONLY on the mount, and
 * MNT_NODIRATIME / MS_NODIRATIME when the inode is a directory.
 *
 * NOTE(review): the return statement of each test is not visible in this
 * listing - presumably each match returns true; confirm.
 */
996 static bool file_is_noatime(const struct file
*file
)
998 const struct vfsmount
*mnt
= file
->f_path
.mnt
;
999 const struct inode
*inode
= file_inode(file
);
1001 /* Adapted from file_accessed() and touch_atime().*/
1002 if (file
->f_flags
& O_NOATIME
)
1005 if (inode
->i_flags
& S_NOATIME
)
1008 if (IS_NOATIME(inode
))
1011 if (mnt
->mnt_flags
& (MNT_NOATIME
| MNT_READONLY
))
1014 if ((mnt
->mnt_flags
& MNT_NODIRATIME
) && S_ISDIR(inode
->i_mode
))
1017 if ((inode
->i_sb
->s_flags
& MS_NODIRATIME
) && S_ISDIR(inode
->i_mode
))
/*
 * Initialise @io for an I/O on @file: the non-blocking flag, the append
 * and sync settings of the write descriptor, the target cl_object, the
 * lock requirement (CILR_MAYBE by default, CILR_NEVER plus ci_no_srvlock
 * for files flagged by ll_file_nolock(), CILR_MANDATORY for O_APPEND) and
 * the noatime flag from file_is_noatime().
 *
 * NOTE(review): how @write gates the ci_wr settings and the last term of
 * the wr_sync expression are not visible in this listing.
 */
1023 static void ll_io_init(struct cl_io
*io
, const struct file
*file
, int write
)
1025 struct inode
*inode
= file_inode(file
);
1027 io
->u
.ci_rw
.crw_nonblock
= file
->f_flags
& O_NONBLOCK
;
1029 io
->u
.ci_wr
.wr_append
= !!(file
->f_flags
& O_APPEND
);
1030 io
->u
.ci_wr
.wr_sync
= file
->f_flags
& O_SYNC
||
1031 file
->f_flags
& O_DIRECT
||
1034 io
->ci_obj
= ll_i2info(inode
)->lli_clob
;
1035 io
->ci_lockreq
= CILR_MAYBE
;
1036 if (ll_file_nolock(file
)) {
1037 io
->ci_lockreq
= CILR_NEVER
;
1038 io
->ci_no_srvlock
= 1;
1039 } else if (file
->f_flags
& O_APPEND
) {
1040 io
->ci_lockreq
= CILR_MANDATORY
;
1043 io
->ci_noatime
= file_is_noatime(file
);
/*
 * Common read/write path: set up a cl_io of type @iot on @file at *@ppos
 * for @count bytes and run it with cl_io_loop().
 *
 * Writes, and reads on O_DIRECT files, take a range lock on
 * lli_write_tree (the whole file for O_APPEND) unless the fd holds a group
 * lock. After the loop, transferred bytes are accumulated in "result",
 * @count and *@ppos are advanced, and the I/O is restarted when
 * io->ci_need_restart is set and bytes remain. Read/write byte counters
 * are tallied and fd_write_failed is updated for writes. The function
 * returns "result" when it is positive, otherwise rc.
 *
 * NOTE(review): the return type line, several declarations, labels and
 * guards are not visible in this listing.
 */
1047 ll_file_io_generic(const struct lu_env
*env
, struct vvp_io_args
*args
,
1048 struct file
*file
, enum cl_io_type iot
,
1049 loff_t
*ppos
, size_t count
)
1051 struct ll_inode_info
*lli
= ll_i2info(file_inode(file
));
1052 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
1053 struct vvp_io
*vio
= vvp_env_io(env
);
1054 struct range_lock range
;
1059 CDEBUG(D_VFSTRACE
, "file: %pD, type: %d ppos: %llu, count: %zu\n",
1060 file
, iot
, *ppos
, count
);
1063 io
= vvp_env_thread_io(env
);
1064 ll_io_init(io
, file
, iot
== CIT_WRITE
);
1066 if (cl_io_rw_init(env
, io
, iot
, *ppos
, count
) == 0) {
1067 struct vvp_io
*vio
= vvp_env_io(env
);
1068 bool range_locked
= false;
1070 if (file
->f_flags
& O_APPEND
)
1071 range_lock_init(&range
, 0, LUSTRE_EOF
);
1073 range_lock_init(&range
, *ppos
, *ppos
+ count
- 1);
1075 vio
->vui_fd
= LUSTRE_FPRIVATE(file
);
1076 vio
->vui_iter
= args
->u
.normal
.via_iter
;
1077 vio
->vui_iocb
= args
->u
.normal
.via_iocb
;
1079 * Direct IO reads must also take range lock,
1080 * or multiple reads will try to work on the same pages
1081 * See LU-6227 for details.
1083 if (((iot
== CIT_WRITE
) ||
1084 (iot
== CIT_READ
&& (file
->f_flags
& O_DIRECT
))) &&
1085 !(vio
->vui_fd
->fd_flags
& LL_FILE_GROUP_LOCKED
)) {
1086 CDEBUG(D_VFSTRACE
, "Range lock [%llu, %llu]\n",
1087 range
.rl_node
.in_extent
.start
,
1088 range
.rl_node
.in_extent
.end
);
1089 rc
= range_lock(&lli
->lli_write_tree
, &range
);
1093 range_locked
= true;
1095 ll_cl_add(file
, env
, io
);
1096 rc
= cl_io_loop(env
, io
);
1097 ll_cl_remove(file
, env
);
1099 CDEBUG(D_VFSTRACE
, "Range unlock [%llu, %llu]\n",
1100 range
.rl_node
.in_extent
.start
,
1101 range
.rl_node
.in_extent
.end
);
1102 range_unlock(&lli
->lli_write_tree
, &range
);
1105 /* cl_io_rw_init() handled IO */
1109 if (io
->ci_nob
> 0) {
1110 result
= io
->ci_nob
;
1111 count
-= io
->ci_nob
;
1112 *ppos
= io
->u
.ci_wr
.wr
.crw_pos
;
1114 /* prepare IO restart */
1116 args
->u
.normal
.via_iter
= vio
->vui_iter
;
1119 cl_io_fini(env
, io
);
1121 if ((!rc
|| rc
== -ENODATA
) && count
> 0 && io
->ci_need_restart
) {
1122 CDEBUG(D_VFSTRACE
, "%s: restart %s from %lld, count:%zu, result: %zd\n",
1123 file_dentry(file
)->d_name
.name
,
1124 iot
== CIT_READ
? "read" : "write",
1125 *ppos
, count
, result
);
1129 if (iot
== CIT_READ
) {
1131 ll_stats_ops_tally(ll_i2sbi(file_inode(file
)),
1132 LPROC_LL_READ_BYTES
, result
);
1133 } else if (iot
== CIT_WRITE
) {
1135 ll_stats_ops_tally(ll_i2sbi(file_inode(file
)),
1136 LPROC_LL_WRITE_BYTES
, result
);
1137 fd
->fd_write_failed
= false;
1138 } else if (!result
&& !rc
) {
1141 fd
->fd_write_failed
= true;
1143 fd
->fd_write_failed
= false;
1144 } else if (rc
!= -ERESTARTSYS
) {
1145 fd
->fd_write_failed
= true;
1148 CDEBUG(D_VFSTRACE
, "iot: %d, result: %zd\n", iot
, result
);
1150 return result
> 0 ? result
: rc
;
/*
 * Read from the file of @iocb into @to.
 *
 * Obtains a cl environment, stores @to and @iocb in the environment's
 * vvp_io_args and runs ll_file_io_generic() with CIT_READ at
 * iocb->ki_pos for iov_iter_count(to) bytes, then releases the
 * environment.
 *
 * NOTE(review): the local declarations, the test guarding the
 * PTR_ERR(env) return and the final return are not visible in this
 * listing.
 */
1153 static ssize_t
ll_file_read_iter(struct kiocb
*iocb
, struct iov_iter
*to
)
1156 struct vvp_io_args
*args
;
1160 env
= cl_env_get(&refcheck
);
1162 return PTR_ERR(env
);
1164 args
= ll_env_args(env
);
1165 args
->u
.normal
.via_iter
= to
;
1166 args
->u
.normal
.via_iocb
= iocb
;
1168 result
= ll_file_io_generic(env
, args
, iocb
->ki_filp
, CIT_READ
,
1169 &iocb
->ki_pos
, iov_iter_count(to
));
1170 cl_env_put(env
, &refcheck
);
1175 * Write to a file (through the page cache).
/*
 * Write the data of @from to the file of @iocb.
 *
 * Obtains a cl environment, stores @from and @iocb in the environment's
 * vvp_io_args and runs ll_file_io_generic() with CIT_WRITE at
 * iocb->ki_pos for iov_iter_count(from) bytes, then releases the
 * environment.
 *
 * NOTE(review): the local declarations, the test guarding the
 * PTR_ERR(env) return and the final return are not visible in this
 * listing.
 */
1177 static ssize_t
ll_file_write_iter(struct kiocb
*iocb
, struct iov_iter
*from
)
1180 struct vvp_io_args
*args
;
1184 env
= cl_env_get(&refcheck
);
1186 return PTR_ERR(env
);
1188 args
= ll_env_args(env
);
1189 args
->u
.normal
.via_iter
= from
;
1190 args
->u
.normal
.via_iocb
= iocb
;
1192 result
= ll_file_io_generic(env
, args
, iocb
->ki_filp
, CIT_WRITE
,
1193 &iocb
->ki_pos
, iov_iter_count(from
));
1194 cl_env_put(env
, &refcheck
);
/*
 * Apply the striping described by @lum to @inode by executing an
 * open-by-FID intent (ll_intent_file_open()) that carries @lum, with
 * @flags | MDS_OPEN_BY_FID as intent flags. The call is made under the
 * inode size lock; the open handle obtained is released again with
 * ll_release_openhandle() and the intent is released at the end.
 *
 * NOTE(review): the last parameter, the rest of the intent initialiser,
 * the error handling and the return statement are not visible in this
 * listing.
 */
1198 int ll_lov_setstripe_ea_info(struct inode
*inode
, struct dentry
*dentry
,
1199 __u64 flags
, struct lov_user_md
*lum
,
1202 struct lookup_intent oit
= {
1204 .it_flags
= flags
| MDS_OPEN_BY_FID
,
1208 ll_inode_size_lock(inode
);
1209 rc
= ll_intent_file_open(dentry
, lum
, lum_size
, &oit
);
1213 ll_release_openhandle(inode
, &oit
);
1216 ll_inode_size_unlock(inode
);
1217 ll_intent_release(&oit
);
/*
 * Fetch the LOV EA of @filename, looked up relative to @inode, through
 * md_getattr_name() with OBD_MD_FLEASIZE | OBD_MD_FLDIREA requested.
 *
 * The reply must carry one of those valid bits, and the EA magic must be
 * LOV_MAGIC_V1 or LOV_MAGIC_V3 (compared in little-endian form). When the
 * host is not little endian, the EA is swabbed to host order; the
 * per-object array is swabbed only when the reply describes a regular
 * file. The EA size is stored in *@lmm_size.
 *
 * NOTE(review): the error returns, the stores to *@lmmp and *@request and
 * parts of the swab calls are not visible in this listing.
 */
1221 int ll_lov_getstripe_ea_info(struct inode
*inode
, const char *filename
,
1222 struct lov_mds_md
**lmmp
, int *lmm_size
,
1223 struct ptlrpc_request
**request
)
1225 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
1226 struct mdt_body
*body
;
1227 struct lov_mds_md
*lmm
= NULL
;
1228 struct ptlrpc_request
*req
= NULL
;
1229 struct md_op_data
*op_data
;
1232 rc
= ll_get_default_mdsize(sbi
, &lmmsize
);
1236 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, filename
,
1237 strlen(filename
), lmmsize
,
1238 LUSTRE_OPC_ANY
, NULL
);
1239 if (IS_ERR(op_data
))
1240 return PTR_ERR(op_data
);
1242 op_data
->op_valid
= OBD_MD_FLEASIZE
| OBD_MD_FLDIREA
;
1243 rc
= md_getattr_name(sbi
->ll_md_exp
, op_data
, &req
);
1244 ll_finish_md_op_data(op_data
);
1246 CDEBUG(D_INFO
, "md_getattr_name failed on %s: rc %d\n",
1251 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
1253 lmmsize
= body
->mbo_eadatasize
;
1255 if (!(body
->mbo_valid
& (OBD_MD_FLEASIZE
| OBD_MD_FLDIREA
)) ||
1261 lmm
= req_capsule_server_sized_get(&req
->rq_pill
, &RMF_MDT_MD
, lmmsize
);
1263 if ((lmm
->lmm_magic
!= cpu_to_le32(LOV_MAGIC_V1
)) &&
1264 (lmm
->lmm_magic
!= cpu_to_le32(LOV_MAGIC_V3
))) {
1270 * This is coming from the MDS, so is probably in
1271 * little endian. We convert it to host endian before
1272 * passing it to userspace.
1274 if (cpu_to_le32(LOV_MAGIC
) != LOV_MAGIC
) {
1277 stripe_count
= le16_to_cpu(lmm
->lmm_stripe_count
);
1278 if (le32_to_cpu(lmm
->lmm_pattern
) & LOV_PATTERN_F_RELEASED
)
1281 /* if function called for directory - we should
1282 * avoid swab not existent lsm objects
1284 if (lmm
->lmm_magic
== cpu_to_le32(LOV_MAGIC_V1
)) {
1285 lustre_swab_lov_user_md_v1((struct lov_user_md_v1
*)lmm
);
1286 if (S_ISREG(body
->mbo_mode
))
1287 lustre_swab_lov_user_md_objects(
1288 ((struct lov_user_md_v1
*)lmm
)->lmm_objects
,
1290 } else if (lmm
->lmm_magic
== cpu_to_le32(LOV_MAGIC_V3
)) {
1291 lustre_swab_lov_user_md_v3((struct lov_user_md_v3
*)lmm
);
1292 if (S_ISREG(body
->mbo_mode
))
1293 lustre_swab_lov_user_md_objects(
1294 ((struct lov_user_md_v3
*)lmm
)->lmm_objects
,
1301 *lmm_size
= lmmsize
;
1306 static int ll_lov_setea(struct inode
*inode
, struct file
*file
,
1309 __u64 flags
= MDS_OPEN_HAS_OBJS
| FMODE_WRITE
;
1310 struct lov_user_md
*lump
;
1311 int lum_size
= sizeof(struct lov_user_md
) +
1312 sizeof(struct lov_user_ost_data
);
1315 if (!capable(CFS_CAP_SYS_ADMIN
))
1318 lump
= libcfs_kvzalloc(lum_size
, GFP_NOFS
);
1322 if (copy_from_user(lump
, (struct lov_user_md __user
*)arg
, lum_size
)) {
1327 rc
= ll_lov_setstripe_ea_info(inode
, file
->f_path
.dentry
, flags
, lump
,
1329 cl_lov_delay_create_clear(&file
->f_flags
);
1335 static int ll_file_getstripe(struct inode
*inode
,
1336 struct lov_user_md __user
*lum
)
1342 env
= cl_env_get(&refcheck
);
1344 return PTR_ERR(env
);
1346 rc
= cl_object_getstripe(env
, ll_i2info(inode
)->lli_clob
, lum
);
1347 cl_env_put(env
, &refcheck
);
1351 static int ll_lov_setstripe(struct inode
*inode
, struct file
*file
,
1354 struct lov_user_md __user
*lum
= (struct lov_user_md __user
*)arg
;
1355 struct lov_user_md
*klum
;
1357 __u64 flags
= FMODE_WRITE
;
1359 rc
= ll_copy_user_md(lum
, &klum
);
1364 rc
= ll_lov_setstripe_ea_info(inode
, file
->f_path
.dentry
, flags
, klum
,
1366 cl_lov_delay_create_clear(&file
->f_flags
);
1370 put_user(0, &lum
->lmm_stripe_count
);
1372 ll_layout_refresh(inode
, &gen
);
1373 rc
= ll_file_getstripe(inode
, (struct lov_user_md __user
*)arg
);
1381 ll_get_grouplock(struct inode
*inode
, struct file
*file
, unsigned long arg
)
1383 struct ll_inode_info
*lli
= ll_i2info(inode
);
1384 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
1385 struct ll_grouplock grouplock
;
1389 CWARN("group id for group lock must not be 0\n");
1393 if (ll_file_nolock(file
))
1396 spin_lock(&lli
->lli_lock
);
1397 if (fd
->fd_flags
& LL_FILE_GROUP_LOCKED
) {
1398 CWARN("group lock already existed with gid %lu\n",
1399 fd
->fd_grouplock
.lg_gid
);
1400 spin_unlock(&lli
->lli_lock
);
1403 LASSERT(!fd
->fd_grouplock
.lg_lock
);
1404 spin_unlock(&lli
->lli_lock
);
1406 rc
= cl_get_grouplock(ll_i2info(inode
)->lli_clob
,
1407 arg
, (file
->f_flags
& O_NONBLOCK
), &grouplock
);
1411 spin_lock(&lli
->lli_lock
);
1412 if (fd
->fd_flags
& LL_FILE_GROUP_LOCKED
) {
1413 spin_unlock(&lli
->lli_lock
);
1414 CERROR("another thread just won the race\n");
1415 cl_put_grouplock(&grouplock
);
1419 fd
->fd_flags
|= LL_FILE_GROUP_LOCKED
;
1420 fd
->fd_grouplock
= grouplock
;
1421 spin_unlock(&lli
->lli_lock
);
1423 CDEBUG(D_INFO
, "group lock %lu obtained\n", arg
);
1427 static int ll_put_grouplock(struct inode
*inode
, struct file
*file
,
1430 struct ll_inode_info
*lli
= ll_i2info(inode
);
1431 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
1432 struct ll_grouplock grouplock
;
1434 spin_lock(&lli
->lli_lock
);
1435 if (!(fd
->fd_flags
& LL_FILE_GROUP_LOCKED
)) {
1436 spin_unlock(&lli
->lli_lock
);
1437 CWARN("no group lock held\n");
1440 LASSERT(fd
->fd_grouplock
.lg_lock
);
1442 if (fd
->fd_grouplock
.lg_gid
!= arg
) {
1443 CWARN("group lock %lu doesn't match current id %lu\n",
1444 arg
, fd
->fd_grouplock
.lg_gid
);
1445 spin_unlock(&lli
->lli_lock
);
1449 grouplock
= fd
->fd_grouplock
;
1450 memset(&fd
->fd_grouplock
, 0, sizeof(fd
->fd_grouplock
));
1451 fd
->fd_flags
&= ~LL_FILE_GROUP_LOCKED
;
1452 spin_unlock(&lli
->lli_lock
);
1454 cl_put_grouplock(&grouplock
);
1455 CDEBUG(D_INFO
, "group lock %lu released\n", arg
);
1460 * Close inode open handle
1462 * \param inode [in] inode in question
1463 * \param it [in,out] intent which contains open info and result
1466 * \retval <0 failure
1468 int ll_release_openhandle(struct inode
*inode
, struct lookup_intent
*it
)
1470 struct obd_client_handle
*och
;
1475 /* Root ? Do nothing. */
1476 if (is_root_inode(inode
))
1479 /* No open handle to close? Move away */
1480 if (!it_disposition(it
, DISP_OPEN_OPEN
))
1483 LASSERT(it_open_error(DISP_OPEN_OPEN
, it
) == 0);
1485 och
= kzalloc(sizeof(*och
), GFP_NOFS
);
1491 ll_och_fill(ll_i2sbi(inode
)->ll_md_exp
, it
, och
);
1493 rc
= ll_close_inode_openhandle(inode
, och
, 0, NULL
);
1495 /* this one is in place of ll_file_open */
1496 if (it_disposition(it
, DISP_ENQ_OPEN_REF
)) {
1497 ptlrpc_req_finished(it
->it_request
);
1498 it_clear_disposition(it
, DISP_ENQ_OPEN_REF
);
1504 * Get size for inode for which FIEMAP mapping is requested.
1505 * Make the FIEMAP get_info call and returns the result.
1507 * \param fiemap kernel buffer to hold extens
1508 * \param num_bytes kernel buffer size
1510 static int ll_do_fiemap(struct inode
*inode
, struct fiemap
*fiemap
,
1513 struct ll_fiemap_info_key fmkey
= { .lfik_name
= KEY_FIEMAP
, };
1518 /* Checks for fiemap flags */
1519 if (fiemap
->fm_flags
& ~LUSTRE_FIEMAP_FLAGS_COMPAT
) {
1520 fiemap
->fm_flags
&= ~LUSTRE_FIEMAP_FLAGS_COMPAT
;
1524 /* Check for FIEMAP_FLAG_SYNC */
1525 if (fiemap
->fm_flags
& FIEMAP_FLAG_SYNC
) {
1526 rc
= filemap_fdatawrite(inode
->i_mapping
);
1531 env
= cl_env_get(&refcheck
);
1533 return PTR_ERR(env
);
1535 if (i_size_read(inode
) == 0) {
1536 rc
= ll_glimpse_size(inode
);
1541 fmkey
.lfik_oa
.o_valid
= OBD_MD_FLID
| OBD_MD_FLGROUP
;
1542 obdo_from_inode(&fmkey
.lfik_oa
, inode
, OBD_MD_FLSIZE
);
1543 obdo_set_parent_fid(&fmkey
.lfik_oa
, &ll_i2info(inode
)->lli_fid
);
1545 /* If filesize is 0, then there would be no objects for mapping */
1546 if (fmkey
.lfik_oa
.o_size
== 0) {
1547 fiemap
->fm_mapped_extents
= 0;
1552 memcpy(&fmkey
.lfik_fiemap
, fiemap
, sizeof(*fiemap
));
1554 rc
= cl_object_fiemap(env
, ll_i2info(inode
)->lli_clob
,
1555 &fmkey
, fiemap
, &num_bytes
);
1557 cl_env_put(env
, &refcheck
);
1561 int ll_fid2path(struct inode
*inode
, void __user
*arg
)
1563 struct obd_export
*exp
= ll_i2mdexp(inode
);
1564 const struct getinfo_fid2path __user
*gfin
= arg
;
1565 struct getinfo_fid2path
*gfout
;
1570 if (!capable(CFS_CAP_DAC_READ_SEARCH
) &&
1571 !(ll_i2sbi(inode
)->ll_flags
& LL_SBI_USER_FID2PATH
))
1574 /* Only need to get the buflen */
1575 if (get_user(pathlen
, &gfin
->gf_pathlen
))
1578 if (pathlen
> PATH_MAX
)
1581 outsize
= sizeof(*gfout
) + pathlen
;
1583 gfout
= kzalloc(outsize
, GFP_NOFS
);
1587 if (copy_from_user(gfout
, arg
, sizeof(*gfout
))) {
1592 /* Call mdc_iocontrol */
1593 rc
= obd_iocontrol(OBD_IOC_FID2PATH
, exp
, outsize
, gfout
, NULL
);
1597 if (copy_to_user(arg
, gfout
, outsize
))
1606 * Read the data_version for inode.
1608 * This value is computed using stripe object version on OST.
1609 * Version is computed using server side locking.
1611 * @param flags if do sync on the OST side;
1613 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1614 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
1616 int ll_data_version(struct inode
*inode
, __u64
*data_version
, int flags
)
1618 struct cl_object
*obj
= ll_i2info(inode
)->lli_clob
;
1624 /* If no file object initialized, we consider its version is 0. */
1630 env
= cl_env_get(&refcheck
);
1632 return PTR_ERR(env
);
1634 io
= vvp_env_thread_io(env
);
1636 io
->u
.ci_data_version
.dv_data_version
= 0;
1637 io
->u
.ci_data_version
.dv_flags
= flags
;
1640 if (!cl_io_init(env
, io
, CIT_DATA_VERSION
, io
->ci_obj
))
1641 result
= cl_io_loop(env
, io
);
1643 result
= io
->ci_result
;
1645 *data_version
= io
->u
.ci_data_version
.dv_data_version
;
1647 cl_io_fini(env
, io
);
1649 if (unlikely(io
->ci_need_restart
))
1652 cl_env_put(env
, &refcheck
);
1658 * Trigger a HSM release request for the provided inode.
1660 int ll_hsm_release(struct inode
*inode
)
1663 struct obd_client_handle
*och
= NULL
;
1664 __u64 data_version
= 0;
1668 CDEBUG(D_INODE
, "%s: Releasing file " DFID
".\n",
1669 ll_get_fsname(inode
->i_sb
, NULL
, 0),
1670 PFID(&ll_i2info(inode
)->lli_fid
));
1672 och
= ll_lease_open(inode
, NULL
, FMODE_WRITE
, MDS_OPEN_RELEASE
);
1678 /* Grab latest data_version and [am]time values */
1679 rc
= ll_data_version(inode
, &data_version
, LL_DV_WR_FLUSH
);
1683 env
= cl_env_get(&refcheck
);
1689 ll_merge_attr(env
, inode
);
1690 cl_env_put(env
, &refcheck
);
1692 /* Release the file.
1693 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1694 * we still need it to pack l_remote_handle to MDT.
1696 rc
= ll_close_inode_openhandle(inode
, och
, MDS_HSM_RELEASE
,
1701 if (och
&& !IS_ERR(och
)) /* close the file */
1702 ll_lease_close(och
, inode
, NULL
);
1707 struct ll_swap_stack
{
1710 struct inode
*inode1
;
1711 struct inode
*inode2
;
1716 static int ll_swap_layouts(struct file
*file1
, struct file
*file2
,
1717 struct lustre_swap_layouts
*lsl
)
1719 struct mdc_swap_layouts msl
;
1720 struct md_op_data
*op_data
;
1723 struct ll_swap_stack
*llss
= NULL
;
1726 llss
= kzalloc(sizeof(*llss
), GFP_NOFS
);
1730 llss
->inode1
= file_inode(file1
);
1731 llss
->inode2
= file_inode(file2
);
1733 rc
= ll_check_swap_layouts_validity(llss
->inode1
, llss
->inode2
);
1737 /* we use 2 bool because it is easier to swap than 2 bits */
1738 if (lsl
->sl_flags
& SWAP_LAYOUTS_CHECK_DV1
)
1739 llss
->check_dv1
= true;
1741 if (lsl
->sl_flags
& SWAP_LAYOUTS_CHECK_DV2
)
1742 llss
->check_dv2
= true;
1744 /* we cannot use lsl->sl_dvX directly because we may swap them */
1745 llss
->dv1
= lsl
->sl_dv1
;
1746 llss
->dv2
= lsl
->sl_dv2
;
1748 rc
= lu_fid_cmp(ll_inode2fid(llss
->inode1
), ll_inode2fid(llss
->inode2
));
1749 if (!rc
) /* same file, done! */
1752 if (rc
< 0) { /* sequentialize it */
1753 swap(llss
->inode1
, llss
->inode2
);
1755 swap(llss
->dv1
, llss
->dv2
);
1756 swap(llss
->check_dv1
, llss
->check_dv2
);
1760 if (gid
!= 0) { /* application asks to flush dirty cache */
1761 rc
= ll_get_grouplock(llss
->inode1
, file1
, gid
);
1765 rc
= ll_get_grouplock(llss
->inode2
, file2
, gid
);
1767 ll_put_grouplock(llss
->inode1
, file1
, gid
);
1772 /* ultimate check, before swapping the layouts we check if
1773 * dataversion has changed (if requested)
1775 if (llss
->check_dv1
) {
1776 rc
= ll_data_version(llss
->inode1
, &dv
, 0);
1779 if (dv
!= llss
->dv1
) {
1785 if (llss
->check_dv2
) {
1786 rc
= ll_data_version(llss
->inode2
, &dv
, 0);
1789 if (dv
!= llss
->dv2
) {
1795 /* struct md_op_data is used to send the swap args to the mdt
1796 * only flags is missing, so we use struct mdc_swap_layouts
1797 * through the md_op_data->op_data
1799 /* flags from user space have to be converted before they are send to
1800 * server, no flag is sent today, they are only used on the client
1804 op_data
= ll_prep_md_op_data(NULL
, llss
->inode1
, llss
->inode2
, NULL
, 0,
1805 0, LUSTRE_OPC_ANY
, &msl
);
1806 if (IS_ERR(op_data
)) {
1807 rc
= PTR_ERR(op_data
);
1811 rc
= obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS
, ll_i2mdexp(llss
->inode1
),
1812 sizeof(*op_data
), op_data
, NULL
);
1813 ll_finish_md_op_data(op_data
);
1817 ll_put_grouplock(llss
->inode2
, file2
, gid
);
1818 ll_put_grouplock(llss
->inode1
, file1
, gid
);
1827 int ll_hsm_state_set(struct inode
*inode
, struct hsm_state_set
*hss
)
1829 struct md_op_data
*op_data
;
1832 /* Detect out-of range masks */
1833 if ((hss
->hss_setmask
| hss
->hss_clearmask
) & ~HSM_FLAGS_MASK
)
1836 /* Non-root users are forbidden to set or clear flags which are
1837 * NOT defined in HSM_USER_MASK.
1839 if (((hss
->hss_setmask
| hss
->hss_clearmask
) & ~HSM_USER_MASK
) &&
1840 !capable(CFS_CAP_SYS_ADMIN
))
1843 /* Detect out-of range archive id */
1844 if ((hss
->hss_valid
& HSS_ARCHIVE_ID
) &&
1845 (hss
->hss_archive_id
> LL_HSM_MAX_ARCHIVE
))
1848 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
1849 LUSTRE_OPC_ANY
, hss
);
1850 if (IS_ERR(op_data
))
1851 return PTR_ERR(op_data
);
1853 rc
= obd_iocontrol(LL_IOC_HSM_STATE_SET
, ll_i2mdexp(inode
),
1854 sizeof(*op_data
), op_data
, NULL
);
1856 ll_finish_md_op_data(op_data
);
1861 static int ll_hsm_import(struct inode
*inode
, struct file
*file
,
1862 struct hsm_user_import
*hui
)
1864 struct hsm_state_set
*hss
= NULL
;
1865 struct iattr
*attr
= NULL
;
1868 if (!S_ISREG(inode
->i_mode
))
1872 hss
= kzalloc(sizeof(*hss
), GFP_NOFS
);
1876 hss
->hss_valid
= HSS_SETMASK
| HSS_ARCHIVE_ID
;
1877 hss
->hss_archive_id
= hui
->hui_archive_id
;
1878 hss
->hss_setmask
= HS_ARCHIVED
| HS_EXISTS
| HS_RELEASED
;
1879 rc
= ll_hsm_state_set(inode
, hss
);
1883 attr
= kzalloc(sizeof(*attr
), GFP_NOFS
);
1889 attr
->ia_mode
= hui
->hui_mode
& 0777;
1890 attr
->ia_mode
|= S_IFREG
;
1891 attr
->ia_uid
= make_kuid(&init_user_ns
, hui
->hui_uid
);
1892 attr
->ia_gid
= make_kgid(&init_user_ns
, hui
->hui_gid
);
1893 attr
->ia_size
= hui
->hui_size
;
1894 attr
->ia_mtime
.tv_sec
= hui
->hui_mtime
;
1895 attr
->ia_mtime
.tv_nsec
= hui
->hui_mtime_ns
;
1896 attr
->ia_atime
.tv_sec
= hui
->hui_atime
;
1897 attr
->ia_atime
.tv_nsec
= hui
->hui_atime_ns
;
1899 attr
->ia_valid
= ATTR_SIZE
| ATTR_MODE
| ATTR_FORCE
|
1900 ATTR_UID
| ATTR_GID
|
1901 ATTR_MTIME
| ATTR_MTIME_SET
|
1902 ATTR_ATIME
| ATTR_ATIME_SET
;
1906 rc
= ll_setattr_raw(file
->f_path
.dentry
, attr
, true);
1910 inode_unlock(inode
);
1918 static inline long ll_lease_type_from_fmode(fmode_t fmode
)
1920 return ((fmode
& FMODE_READ
) ? LL_LEASE_RDLCK
: 0) |
1921 ((fmode
& FMODE_WRITE
) ? LL_LEASE_WRLCK
: 0);
1925 ll_file_ioctl(struct file
*file
, unsigned int cmd
, unsigned long arg
)
1927 struct inode
*inode
= file_inode(file
);
1928 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
1931 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p),cmd=%x\n",
1932 PFID(ll_inode2fid(inode
)), inode
, cmd
);
1933 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_IOCTL
, 1);
1935 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1936 if (_IOC_TYPE(cmd
) == 'T' || _IOC_TYPE(cmd
) == 't') /* tty ioctls */
1940 case LL_IOC_GETFLAGS
:
1941 /* Get the current value of the file flags */
1942 return put_user(fd
->fd_flags
, (int __user
*)arg
);
1943 case LL_IOC_SETFLAGS
:
1944 case LL_IOC_CLRFLAGS
:
1945 /* Set or clear specific file flags */
1946 /* XXX This probably needs checks to ensure the flags are
1947 * not abused, and to handle any flag side effects.
1949 if (get_user(flags
, (int __user
*)arg
))
1952 if (cmd
== LL_IOC_SETFLAGS
) {
1953 if ((flags
& LL_FILE_IGNORE_LOCK
) &&
1954 !(file
->f_flags
& O_DIRECT
)) {
1955 CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
1960 fd
->fd_flags
|= flags
;
1962 fd
->fd_flags
&= ~flags
;
1965 case LL_IOC_LOV_SETSTRIPE
:
1966 return ll_lov_setstripe(inode
, file
, arg
);
1967 case LL_IOC_LOV_SETEA
:
1968 return ll_lov_setea(inode
, file
, arg
);
1969 case LL_IOC_LOV_SWAP_LAYOUTS
: {
1971 struct lustre_swap_layouts lsl
;
1973 if (copy_from_user(&lsl
, (char __user
*)arg
,
1974 sizeof(struct lustre_swap_layouts
)))
1977 if ((file
->f_flags
& O_ACCMODE
) == O_RDONLY
)
1980 file2
= fget(lsl
.sl_fd
);
1984 /* O_WRONLY or O_RDWR */
1985 if ((file2
->f_flags
& O_ACCMODE
) == O_RDONLY
) {
1990 if (lsl
.sl_flags
& SWAP_LAYOUTS_CLOSE
) {
1991 struct obd_client_handle
*och
= NULL
;
1992 struct ll_inode_info
*lli
;
1993 struct inode
*inode2
;
1995 if (lsl
.sl_flags
!= SWAP_LAYOUTS_CLOSE
) {
2000 lli
= ll_i2info(inode
);
2001 mutex_lock(&lli
->lli_och_mutex
);
2002 if (fd
->fd_lease_och
) {
2003 och
= fd
->fd_lease_och
;
2004 fd
->fd_lease_och
= NULL
;
2006 mutex_unlock(&lli
->lli_och_mutex
);
2011 inode2
= file_inode(file2
);
2012 rc
= ll_swap_layouts_close(och
, inode
, inode2
);
2014 rc
= ll_swap_layouts(file
, file2
, &lsl
);
2020 case LL_IOC_LOV_GETSTRIPE
:
2021 return ll_file_getstripe(inode
,
2022 (struct lov_user_md __user
*)arg
);
2023 case FSFILT_IOC_GETFLAGS
:
2024 case FSFILT_IOC_SETFLAGS
:
2025 return ll_iocontrol(inode
, file
, cmd
, arg
);
2026 case FSFILT_IOC_GETVERSION_OLD
:
2027 case FSFILT_IOC_GETVERSION
:
2028 return put_user(inode
->i_generation
, (int __user
*)arg
);
2029 case LL_IOC_GROUP_LOCK
:
2030 return ll_get_grouplock(inode
, file
, arg
);
2031 case LL_IOC_GROUP_UNLOCK
:
2032 return ll_put_grouplock(inode
, file
, arg
);
2033 case IOC_OBD_STATFS
:
2034 return ll_obd_statfs(inode
, (void __user
*)arg
);
2036 /* We need to special case any other ioctls we want to handle,
2037 * to send them to the MDS/OST as appropriate and to properly
2038 * network encode the arg field.
2039 case FSFILT_IOC_SETVERSION_OLD:
2040 case FSFILT_IOC_SETVERSION:
2042 case LL_IOC_FLUSHCTX
:
2043 return ll_flush_ctx(inode
);
2044 case LL_IOC_PATH2FID
: {
2045 if (copy_to_user((void __user
*)arg
, ll_inode2fid(inode
),
2046 sizeof(struct lu_fid
)))
2051 case LL_IOC_GETPARENT
:
2052 return ll_getparent(file
, (struct getparent __user
*)arg
);
2053 case OBD_IOC_FID2PATH
:
2054 return ll_fid2path(inode
, (void __user
*)arg
);
2055 case LL_IOC_DATA_VERSION
: {
2056 struct ioc_data_version idv
;
2059 if (copy_from_user(&idv
, (char __user
*)arg
, sizeof(idv
)))
2062 idv
.idv_flags
&= LL_DV_RD_FLUSH
| LL_DV_WR_FLUSH
;
2063 rc
= ll_data_version(inode
, &idv
.idv_version
, idv
.idv_flags
);
2064 if (rc
== 0 && copy_to_user((char __user
*)arg
, &idv
,
2071 case LL_IOC_GET_MDTIDX
: {
2074 mdtidx
= ll_get_mdt_idx(inode
);
2078 if (put_user(mdtidx
, (int __user
*)arg
))
2083 case OBD_IOC_GETDTNAME
:
2084 case OBD_IOC_GETMDNAME
:
2085 return ll_get_obd_name(inode
, cmd
, arg
);
2086 case LL_IOC_HSM_STATE_GET
: {
2087 struct md_op_data
*op_data
;
2088 struct hsm_user_state
*hus
;
2091 hus
= kzalloc(sizeof(*hus
), GFP_NOFS
);
2095 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
2096 LUSTRE_OPC_ANY
, hus
);
2097 if (IS_ERR(op_data
)) {
2099 return PTR_ERR(op_data
);
2102 rc
= obd_iocontrol(cmd
, ll_i2mdexp(inode
), sizeof(*op_data
),
2105 if (copy_to_user((void __user
*)arg
, hus
, sizeof(*hus
)))
2108 ll_finish_md_op_data(op_data
);
2112 case LL_IOC_HSM_STATE_SET
: {
2113 struct hsm_state_set
*hss
;
2116 hss
= memdup_user((char __user
*)arg
, sizeof(*hss
));
2118 return PTR_ERR(hss
);
2120 rc
= ll_hsm_state_set(inode
, hss
);
2125 case LL_IOC_HSM_ACTION
: {
2126 struct md_op_data
*op_data
;
2127 struct hsm_current_action
*hca
;
2130 hca
= kzalloc(sizeof(*hca
), GFP_NOFS
);
2134 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
2135 LUSTRE_OPC_ANY
, hca
);
2136 if (IS_ERR(op_data
)) {
2138 return PTR_ERR(op_data
);
2141 rc
= obd_iocontrol(cmd
, ll_i2mdexp(inode
), sizeof(*op_data
),
2144 if (copy_to_user((char __user
*)arg
, hca
, sizeof(*hca
)))
2147 ll_finish_md_op_data(op_data
);
2151 case LL_IOC_SET_LEASE
: {
2152 struct ll_inode_info
*lli
= ll_i2info(inode
);
2153 struct obd_client_handle
*och
= NULL
;
2158 case LL_LEASE_WRLCK
:
2159 if (!(file
->f_mode
& FMODE_WRITE
))
2161 fmode
= FMODE_WRITE
;
2163 case LL_LEASE_RDLCK
:
2164 if (!(file
->f_mode
& FMODE_READ
))
2168 case LL_LEASE_UNLCK
:
2169 mutex_lock(&lli
->lli_och_mutex
);
2170 if (fd
->fd_lease_och
) {
2171 och
= fd
->fd_lease_och
;
2172 fd
->fd_lease_och
= NULL
;
2174 mutex_unlock(&lli
->lli_och_mutex
);
2179 fmode
= och
->och_flags
;
2180 rc
= ll_lease_close(och
, inode
, &lease_broken
);
2187 return ll_lease_type_from_fmode(fmode
);
2192 CDEBUG(D_INODE
, "Set lease with mode %u\n", fmode
);
2194 /* apply for lease */
2195 och
= ll_lease_open(inode
, file
, fmode
, 0);
2197 return PTR_ERR(och
);
2200 mutex_lock(&lli
->lli_och_mutex
);
2201 if (!fd
->fd_lease_och
) {
2202 fd
->fd_lease_och
= och
;
2205 mutex_unlock(&lli
->lli_och_mutex
);
2207 /* impossible now that only excl is supported for now */
2208 ll_lease_close(och
, inode
, &lease_broken
);
2213 case LL_IOC_GET_LEASE
: {
2214 struct ll_inode_info
*lli
= ll_i2info(inode
);
2215 struct ldlm_lock
*lock
= NULL
;
2218 mutex_lock(&lli
->lli_och_mutex
);
2219 if (fd
->fd_lease_och
) {
2220 struct obd_client_handle
*och
= fd
->fd_lease_och
;
2222 lock
= ldlm_handle2lock(&och
->och_lease_handle
);
2224 lock_res_and_lock(lock
);
2225 if (!ldlm_is_cancel(lock
))
2226 fmode
= och
->och_flags
;
2227 unlock_res_and_lock(lock
);
2228 LDLM_LOCK_PUT(lock
);
2231 mutex_unlock(&lli
->lli_och_mutex
);
2232 return ll_lease_type_from_fmode(fmode
);
2234 case LL_IOC_HSM_IMPORT
: {
2235 struct hsm_user_import
*hui
;
2237 hui
= memdup_user((void __user
*)arg
, sizeof(*hui
));
2239 return PTR_ERR(hui
);
2241 rc
= ll_hsm_import(inode
, file
, hui
);
2249 if (ll_iocontrol_call(inode
, file
, cmd
, arg
, &err
) ==
2253 return obd_iocontrol(cmd
, ll_i2dtexp(inode
), 0, NULL
,
2254 (void __user
*)arg
);
2259 static loff_t
ll_file_seek(struct file
*file
, loff_t offset
, int origin
)
2261 struct inode
*inode
= file_inode(file
);
2262 loff_t retval
, eof
= 0;
2264 retval
= offset
+ ((origin
== SEEK_END
) ? i_size_read(inode
) :
2265 (origin
== SEEK_CUR
) ? file
->f_pos
: 0);
2266 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p), to=%llu=%#llx(%d)\n",
2267 PFID(ll_inode2fid(inode
)), inode
, retval
, retval
, origin
);
2268 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_LLSEEK
, 1);
2270 if (origin
== SEEK_END
|| origin
== SEEK_HOLE
|| origin
== SEEK_DATA
) {
2271 retval
= ll_glimpse_size(inode
);
2274 eof
= i_size_read(inode
);
2277 return generic_file_llseek_size(file
, offset
, origin
,
2278 ll_file_maxbytes(inode
), eof
);
2281 static int ll_flush(struct file
*file
, fl_owner_t id
)
2283 struct inode
*inode
= file_inode(file
);
2284 struct ll_inode_info
*lli
= ll_i2info(inode
);
2285 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
2288 LASSERT(!S_ISDIR(inode
->i_mode
));
2290 /* catch async errors that were recorded back when async writeback
2291 * failed for pages in this mapping.
2293 rc
= lli
->lli_async_rc
;
2294 lli
->lli_async_rc
= 0;
2295 if (lli
->lli_clob
) {
2296 err
= lov_read_and_clear_async_rc(lli
->lli_clob
);
2301 /* The application has been told about write failure already.
2302 * Do not report failure again.
2304 if (fd
->fd_write_failed
)
2306 return rc
? -EIO
: 0;
2310 * Called to make sure a portion of file has been written out.
2311 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2313 * Return how many pages have been written.
2315 int cl_sync_file_range(struct inode
*inode
, loff_t start
, loff_t end
,
2316 enum cl_fsync_mode mode
, int ignore_layout
)
2320 struct cl_fsync_io
*fio
;
2324 if (mode
!= CL_FSYNC_NONE
&& mode
!= CL_FSYNC_LOCAL
&&
2325 mode
!= CL_FSYNC_DISCARD
&& mode
!= CL_FSYNC_ALL
)
2328 env
= cl_env_get(&refcheck
);
2330 return PTR_ERR(env
);
2332 io
= vvp_env_thread_io(env
);
2333 io
->ci_obj
= ll_i2info(inode
)->lli_clob
;
2334 io
->ci_ignore_layout
= ignore_layout
;
2336 /* initialize parameters for sync */
2337 fio
= &io
->u
.ci_fsync
;
2338 fio
->fi_start
= start
;
2340 fio
->fi_fid
= ll_inode2fid(inode
);
2341 fio
->fi_mode
= mode
;
2342 fio
->fi_nr_written
= 0;
2344 if (cl_io_init(env
, io
, CIT_FSYNC
, io
->ci_obj
) == 0)
2345 result
= cl_io_loop(env
, io
);
2347 result
= io
->ci_result
;
2349 result
= fio
->fi_nr_written
;
2350 cl_io_fini(env
, io
);
2351 cl_env_put(env
, &refcheck
);
2356 int ll_fsync(struct file
*file
, loff_t start
, loff_t end
, int datasync
)
2358 struct inode
*inode
= file_inode(file
);
2359 struct ll_inode_info
*lli
= ll_i2info(inode
);
2360 struct ptlrpc_request
*req
;
2363 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p)\n",
2364 PFID(ll_inode2fid(inode
)), inode
);
2365 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_FSYNC
, 1);
2367 rc
= filemap_write_and_wait_range(inode
->i_mapping
, start
, end
);
2370 /* catch async errors that were recorded back when async writeback
2371 * failed for pages in this mapping.
2373 if (!S_ISDIR(inode
->i_mode
)) {
2374 err
= lli
->lli_async_rc
;
2375 lli
->lli_async_rc
= 0;
2378 if (lli
->lli_clob
) {
2379 err
= lov_read_and_clear_async_rc(lli
->lli_clob
);
2385 err
= md_sync(ll_i2sbi(inode
)->ll_md_exp
, ll_inode2fid(inode
), &req
);
2389 ptlrpc_req_finished(req
);
2391 if (S_ISREG(inode
->i_mode
)) {
2392 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
2394 err
= cl_sync_file_range(inode
, start
, end
, CL_FSYNC_ALL
, 0);
2395 if (rc
== 0 && err
< 0)
2398 fd
->fd_write_failed
= true;
2400 fd
->fd_write_failed
= false;
2403 inode_unlock(inode
);
2408 ll_file_flock(struct file
*file
, int cmd
, struct file_lock
*file_lock
)
2410 struct inode
*inode
= file_inode(file
);
2411 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
2412 struct ldlm_enqueue_info einfo
= {
2413 .ei_type
= LDLM_FLOCK
,
2414 .ei_cb_cp
= ldlm_flock_completion_ast
,
2415 .ei_cbdata
= file_lock
,
2417 struct md_op_data
*op_data
;
2418 struct lustre_handle lockh
= {0};
2419 union ldlm_policy_data flock
= { { 0 } };
2420 int fl_type
= file_lock
->fl_type
;
2425 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
" file_lock=%p\n",
2426 PFID(ll_inode2fid(inode
)), file_lock
);
2428 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_FLOCK
, 1);
2430 if (file_lock
->fl_flags
& FL_FLOCK
)
2431 LASSERT((cmd
== F_SETLKW
) || (cmd
== F_SETLK
));
2432 else if (!(file_lock
->fl_flags
& FL_POSIX
))
2435 flock
.l_flock
.owner
= (unsigned long)file_lock
->fl_owner
;
2436 flock
.l_flock
.pid
= file_lock
->fl_pid
;
2437 flock
.l_flock
.start
= file_lock
->fl_start
;
2438 flock
.l_flock
.end
= file_lock
->fl_end
;
2440 /* Somewhat ugly workaround for svc lockd.
2441 * lockd installs custom fl_lmops->lm_compare_owner that checks
2442 * for the fl_owner to be the same (which it always is on local node
2443 * I guess between lockd processes) and then compares pid.
2444 * As such we assign pid to the owner field to make it all work,
2445 * conflict with normal locks is unlikely since pid space and
2446 * pointer space for current->files are not intersecting
2448 if (file_lock
->fl_lmops
&& file_lock
->fl_lmops
->lm_compare_owner
)
2449 flock
.l_flock
.owner
= (unsigned long)file_lock
->fl_pid
;
2453 einfo
.ei_mode
= LCK_PR
;
2456 /* An unlock request may or may not have any relation to
2457 * existing locks so we may not be able to pass a lock handle
2458 * via a normal ldlm_lock_cancel() request. The request may even
2459 * unlock a byte range in the middle of an existing lock. In
2460 * order to process an unlock request we need all of the same
2461 * information that is given with a normal read or write record
2462 * lock request. To avoid creating another ldlm unlock (cancel)
2463 * message we'll treat a LCK_NL flock request as an unlock.
2465 einfo
.ei_mode
= LCK_NL
;
2468 einfo
.ei_mode
= LCK_PW
;
2471 CDEBUG(D_INFO
, "Unknown fcntl lock type: %d\n", fl_type
);
2486 flags
= LDLM_FL_BLOCK_NOWAIT
;
2492 flags
= LDLM_FL_TEST_LOCK
;
2495 CERROR("unknown fcntl lock command: %d\n", cmd
);
2500 * Save the old mode so that if the mode in the lock changes we
2501 * can decrement the appropriate reader or writer refcount.
2503 file_lock
->fl_type
= einfo
.ei_mode
;
2505 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
2506 LUSTRE_OPC_ANY
, NULL
);
2507 if (IS_ERR(op_data
))
2508 return PTR_ERR(op_data
);
2510 CDEBUG(D_DLMTRACE
, "inode=" DFID
", pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2511 PFID(ll_inode2fid(inode
)), flock
.l_flock
.pid
, flags
,
2512 einfo
.ei_mode
, flock
.l_flock
.start
, flock
.l_flock
.end
);
2514 rc
= md_enqueue(sbi
->ll_md_exp
, &einfo
, &flock
, NULL
, op_data
, &lockh
,
2517 /* Restore the file lock type if not TEST lock. */
2518 if (!(flags
& LDLM_FL_TEST_LOCK
))
2519 file_lock
->fl_type
= fl_type
;
2521 if ((rc
== 0 || file_lock
->fl_type
== F_UNLCK
) &&
2522 !(flags
& LDLM_FL_TEST_LOCK
))
2523 rc2
= locks_lock_file_wait(file
, file_lock
);
2525 if (rc2
&& file_lock
->fl_type
!= F_UNLCK
) {
2526 einfo
.ei_mode
= LCK_NL
;
2527 md_enqueue(sbi
->ll_md_exp
, &einfo
, &flock
, NULL
, op_data
,
2532 ll_finish_md_op_data(op_data
);
2537 int ll_get_fid_by_name(struct inode
*parent
, const char *name
,
2538 int namelen
, struct lu_fid
*fid
,
2539 struct inode
**inode
)
2541 struct md_op_data
*op_data
= NULL
;
2542 struct ptlrpc_request
*req
;
2543 struct mdt_body
*body
;
2546 op_data
= ll_prep_md_op_data(NULL
, parent
, NULL
, name
, namelen
, 0,
2547 LUSTRE_OPC_ANY
, NULL
);
2548 if (IS_ERR(op_data
))
2549 return PTR_ERR(op_data
);
2551 op_data
->op_valid
= OBD_MD_FLID
| OBD_MD_FLTYPE
;
2552 rc
= md_getattr_name(ll_i2sbi(parent
)->ll_md_exp
, op_data
, &req
);
2553 ll_finish_md_op_data(op_data
);
2557 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
2563 *fid
= body
->mbo_fid1
;
2566 rc
= ll_prep_inode(inode
, req
, parent
->i_sb
, NULL
);
2568 ptlrpc_req_finished(req
);
2572 int ll_migrate(struct inode
*parent
, struct file
*file
, int mdtidx
,
2573 const char *name
, int namelen
)
2575 struct ptlrpc_request
*request
= NULL
;
2576 struct obd_client_handle
*och
= NULL
;
2577 struct inode
*child_inode
= NULL
;
2578 struct dentry
*dchild
= NULL
;
2579 struct md_op_data
*op_data
;
2580 struct mdt_body
*body
;
2581 u64 data_version
= 0;
2585 CDEBUG(D_VFSTRACE
, "migrate %s under " DFID
" to MDT%d\n",
2586 name
, PFID(ll_inode2fid(parent
)), mdtidx
);
2588 op_data
= ll_prep_md_op_data(NULL
, parent
, NULL
, name
, namelen
,
2589 0, LUSTRE_OPC_ANY
, NULL
);
2590 if (IS_ERR(op_data
))
2591 return PTR_ERR(op_data
);
2593 /* Get child FID first */
2594 qstr
.hash
= full_name_hash(parent
, name
, namelen
);
2597 dchild
= d_lookup(file_dentry(file
), &qstr
);
2599 op_data
->op_fid3
= *ll_inode2fid(dchild
->d_inode
);
2600 if (dchild
->d_inode
)
2601 child_inode
= igrab(dchild
->d_inode
);
2606 rc
= ll_get_fid_by_name(parent
, name
, namelen
,
2607 &op_data
->op_fid3
, &child_inode
);
2617 inode_lock(child_inode
);
2618 op_data
->op_fid3
= *ll_inode2fid(child_inode
);
2619 if (!fid_is_sane(&op_data
->op_fid3
)) {
2620 CERROR("%s: migrate %s, but fid " DFID
" is insane\n",
2621 ll_get_fsname(parent
->i_sb
, NULL
, 0), name
,
2622 PFID(&op_data
->op_fid3
));
2627 rc
= ll_get_mdt_idx_by_fid(ll_i2sbi(parent
), &op_data
->op_fid3
);
2632 CDEBUG(D_INFO
, "%s: " DFID
" is already on MDT%d.\n", name
,
2633 PFID(&op_data
->op_fid3
), mdtidx
);
2638 if (S_ISREG(child_inode
->i_mode
)) {
2639 och
= ll_lease_open(child_inode
, NULL
, FMODE_WRITE
, 0);
2646 rc
= ll_data_version(child_inode
, &data_version
,
2651 op_data
->op_handle
= och
->och_fh
;
2652 op_data
->op_data
= och
->och_mod
;
2653 op_data
->op_data_version
= data_version
;
2654 op_data
->op_lease_handle
= och
->och_lease_handle
;
2655 op_data
->op_bias
|= MDS_RENAME_MIGRATE
;
2658 op_data
->op_mds
= mdtidx
;
2659 op_data
->op_cli_flags
= CLI_MIGRATE
;
2660 rc
= md_rename(ll_i2sbi(parent
)->ll_md_exp
, op_data
, name
,
2661 namelen
, name
, namelen
, &request
);
2664 ll_update_times(request
, parent
);
2666 body
= req_capsule_server_get(&request
->rq_pill
, &RMF_MDT_BODY
);
2670 * If the server does release layout lock, then we cleanup
2671 * the client och here, otherwise release it in out_close:
2673 if (och
&& body
->mbo_valid
& OBD_MD_CLOSE_INTENT_EXECED
) {
2674 obd_mod_put(och
->och_mod
);
2675 md_clear_open_replay_data(ll_i2sbi(parent
)->ll_md_exp
,
2677 och
->och_fh
.cookie
= DEAD_HANDLE_MAGIC
;
2684 ptlrpc_req_finished(request
);
2688 /* Try again if the file layout has changed. */
2689 if (rc
== -EAGAIN
&& S_ISREG(child_inode
->i_mode
))
2693 if (och
) /* close the file */
2694 ll_lease_close(och
, child_inode
, NULL
);
2696 clear_nlink(child_inode
);
2698 inode_unlock(child_inode
);
2701 ll_finish_md_op_data(op_data
);
2706 ll_file_noflock(struct file
*file
, int cmd
, struct file_lock
*file_lock
)
2712 * test if some locks matching bits and l_req_mode are acquired
2713 * - bits can be in different locks
2714 * - if found clear the common lock bits in *bits
2715 * - the bits not found, are kept in *bits
2717 * \param bits [IN] searched lock bits [IN]
2718 * \param l_req_mode [IN] searched lock mode
2719 * \retval boolean, true iff all bits are found
2721 int ll_have_md_lock(struct inode
*inode
, __u64
*bits
,
2722 enum ldlm_mode l_req_mode
)
2724 struct lustre_handle lockh
;
2725 union ldlm_policy_data policy
;
2726 enum ldlm_mode mode
= (l_req_mode
== LCK_MINMODE
) ?
2727 (LCK_CR
| LCK_CW
| LCK_PR
| LCK_PW
) : l_req_mode
;
2735 fid
= &ll_i2info(inode
)->lli_fid
;
2736 CDEBUG(D_INFO
, "trying to match res " DFID
" mode %s\n", PFID(fid
),
2737 ldlm_lockname
[mode
]);
2739 flags
= LDLM_FL_BLOCK_GRANTED
| LDLM_FL_CBPENDING
| LDLM_FL_TEST_LOCK
;
2740 for (i
= 0; i
<= MDS_INODELOCK_MAXSHIFT
&& *bits
!= 0; i
++) {
2741 policy
.l_inodebits
.bits
= *bits
& (1 << i
);
2742 if (policy
.l_inodebits
.bits
== 0)
2745 if (md_lock_match(ll_i2mdexp(inode
), flags
, fid
, LDLM_IBITS
,
2746 &policy
, mode
, &lockh
)) {
2747 struct ldlm_lock
*lock
;
2749 lock
= ldlm_handle2lock(&lockh
);
2752 ~(lock
->l_policy_data
.l_inodebits
.bits
);
2753 LDLM_LOCK_PUT(lock
);
2755 *bits
&= ~policy
.l_inodebits
.bits
;
2762 enum ldlm_mode
ll_take_md_lock(struct inode
*inode
, __u64 bits
,
2763 struct lustre_handle
*lockh
, __u64 flags
,
2764 enum ldlm_mode mode
)
2766 union ldlm_policy_data policy
= { .l_inodebits
= { bits
} };
2769 fid
= &ll_i2info(inode
)->lli_fid
;
2770 CDEBUG(D_INFO
, "trying to match res " DFID
"\n", PFID(fid
));
2772 return md_lock_match(ll_i2mdexp(inode
), flags
| LDLM_FL_BLOCK_GRANTED
,
2773 fid
, LDLM_IBITS
, &policy
, mode
, lockh
);
2776 static int ll_inode_revalidate_fini(struct inode
*inode
, int rc
)
2778 /* Already unlinked. Just update nlink and return success */
2779 if (rc
== -ENOENT
) {
2781 /* If it is striped directory, and there is bad stripe
2782 * Let's revalidate the dentry again, instead of returning
2785 if (S_ISDIR(inode
->i_mode
) && ll_i2info(inode
)->lli_lsm_md
)
2788 /* This path cannot be hit for regular files unless in
2789 * case of obscure races, so no need to validate size.
2791 if (!S_ISREG(inode
->i_mode
) && !S_ISDIR(inode
->i_mode
))
2793 } else if (rc
!= 0) {
2794 CDEBUG_LIMIT((rc
== -EACCES
|| rc
== -EIDRM
) ? D_INFO
: D_ERROR
,
2795 "%s: revalidate FID " DFID
" error: rc = %d\n",
2796 ll_get_fsname(inode
->i_sb
, NULL
, 0),
2797 PFID(ll_inode2fid(inode
)), rc
);
2803 static int __ll_inode_revalidate(struct dentry
*dentry
, __u64 ibits
)
2805 struct inode
*inode
= d_inode(dentry
);
2806 struct ptlrpc_request
*req
= NULL
;
2807 struct obd_export
*exp
;
2810 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p),name=%pd\n",
2811 PFID(ll_inode2fid(inode
)), inode
, dentry
);
2813 exp
= ll_i2mdexp(inode
);
2815 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2816 * But under CMD case, it caused some lock issues, should be fixed
2817 * with new CMD ibits lock. See bug 12718
2819 if (exp_connect_flags(exp
) & OBD_CONNECT_ATTRFID
) {
2820 struct lookup_intent oit
= { .it_op
= IT_GETATTR
};
2821 struct md_op_data
*op_data
;
2823 if (ibits
== MDS_INODELOCK_LOOKUP
)
2824 oit
.it_op
= IT_LOOKUP
;
2826 /* Call getattr by fid, so do not provide name at all. */
2827 op_data
= ll_prep_md_op_data(NULL
, inode
,
2829 LUSTRE_OPC_ANY
, NULL
);
2830 if (IS_ERR(op_data
))
2831 return PTR_ERR(op_data
);
2833 rc
= md_intent_lock(exp
, op_data
, &oit
, &req
,
2834 &ll_md_blocking_ast
, 0);
2835 ll_finish_md_op_data(op_data
);
2837 rc
= ll_inode_revalidate_fini(inode
, rc
);
2841 rc
= ll_revalidate_it_finish(req
, &oit
, inode
);
2843 ll_intent_release(&oit
);
2847 /* Unlinked? Unhash dentry, so it is not picked up later by
2848 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2849 * here to preserve get_cwd functionality on 2.6.
2852 if (!d_inode(dentry
)->i_nlink
) {
2853 spin_lock(&inode
->i_lock
);
2854 d_lustre_invalidate(dentry
, 0);
2855 spin_unlock(&inode
->i_lock
);
2858 ll_lookup_finish_locks(&oit
, inode
);
2859 } else if (!ll_have_md_lock(d_inode(dentry
), &ibits
, LCK_MINMODE
)) {
2860 struct ll_sb_info
*sbi
= ll_i2sbi(d_inode(dentry
));
2861 u64 valid
= OBD_MD_FLGETATTR
;
2862 struct md_op_data
*op_data
;
2865 if (S_ISREG(inode
->i_mode
)) {
2866 rc
= ll_get_default_mdsize(sbi
, &ealen
);
2869 valid
|= OBD_MD_FLEASIZE
| OBD_MD_FLMODEASIZE
;
2872 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
,
2873 0, ealen
, LUSTRE_OPC_ANY
,
2875 if (IS_ERR(op_data
))
2876 return PTR_ERR(op_data
);
2878 op_data
->op_valid
= valid
;
2879 rc
= md_getattr(sbi
->ll_md_exp
, op_data
, &req
);
2880 ll_finish_md_op_data(op_data
);
2882 return ll_inode_revalidate_fini(inode
, rc
);
2884 rc
= ll_prep_inode(&inode
, req
, NULL
, NULL
);
2887 ptlrpc_req_finished(req
);
2891 static int ll_merge_md_attr(struct inode
*inode
)
2893 struct cl_attr attr
= { 0 };
2896 LASSERT(ll_i2info(inode
)->lli_lsm_md
);
2897 rc
= md_merge_attr(ll_i2mdexp(inode
), ll_i2info(inode
)->lli_lsm_md
,
2898 &attr
, ll_md_blocking_ast
);
2902 set_nlink(inode
, attr
.cat_nlink
);
2903 inode
->i_blocks
= attr
.cat_blocks
;
2904 i_size_write(inode
, attr
.cat_size
);
2906 ll_i2info(inode
)->lli_atime
= attr
.cat_atime
;
2907 ll_i2info(inode
)->lli_mtime
= attr
.cat_mtime
;
2908 ll_i2info(inode
)->lli_ctime
= attr
.cat_ctime
;
2913 static int ll_inode_revalidate(struct dentry
*dentry
, __u64 ibits
)
2915 struct inode
*inode
= d_inode(dentry
);
2918 rc
= __ll_inode_revalidate(dentry
, ibits
);
2922 /* if object isn't regular file, don't validate size */
2923 if (!S_ISREG(inode
->i_mode
)) {
2924 if (S_ISDIR(inode
->i_mode
) &&
2925 ll_i2info(inode
)->lli_lsm_md
) {
2926 rc
= ll_merge_md_attr(inode
);
2931 LTIME_S(inode
->i_atime
) = ll_i2info(inode
)->lli_atime
;
2932 LTIME_S(inode
->i_mtime
) = ll_i2info(inode
)->lli_mtime
;
2933 LTIME_S(inode
->i_ctime
) = ll_i2info(inode
)->lli_ctime
;
2935 struct ll_inode_info
*lli
= ll_i2info(inode
);
2937 /* In case of restore, the MDT has the right size and has
2938 * already send it back without granting the layout lock,
2939 * inode is up-to-date so glimpse is useless.
2940 * Also to glimpse we need the layout, in case of a running
2941 * restore the MDT holds the layout lock so the glimpse will
2942 * block up to the end of restore (getattr will block)
2944 if (!test_bit(LLIF_FILE_RESTORING
, &lli
->lli_flags
))
2945 rc
= ll_glimpse_size(inode
);
2950 int ll_getattr(const struct path
*path
, struct kstat
*stat
,
2951 u32 request_mask
, unsigned int flags
)
2953 struct inode
*inode
= d_inode(path
->dentry
);
2954 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
2955 struct ll_inode_info
*lli
= ll_i2info(inode
);
2958 res
= ll_inode_revalidate(path
->dentry
,
2959 MDS_INODELOCK_UPDATE
| MDS_INODELOCK_LOOKUP
);
2960 ll_stats_ops_tally(sbi
, LPROC_LL_GETATTR
, 1);
2965 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY
, 30);
2967 stat
->dev
= inode
->i_sb
->s_dev
;
2968 if (ll_need_32bit_api(sbi
))
2969 stat
->ino
= cl_fid_build_ino(&lli
->lli_fid
, 1);
2971 stat
->ino
= inode
->i_ino
;
2972 stat
->mode
= inode
->i_mode
;
2973 stat
->uid
= inode
->i_uid
;
2974 stat
->gid
= inode
->i_gid
;
2975 stat
->rdev
= inode
->i_rdev
;
2976 stat
->atime
= inode
->i_atime
;
2977 stat
->mtime
= inode
->i_mtime
;
2978 stat
->ctime
= inode
->i_ctime
;
2979 stat
->blksize
= 1 << inode
->i_blkbits
;
2981 stat
->nlink
= inode
->i_nlink
;
2982 stat
->size
= i_size_read(inode
);
2983 stat
->blocks
= inode
->i_blocks
;
2988 static int ll_fiemap(struct inode
*inode
, struct fiemap_extent_info
*fieinfo
,
2989 __u64 start
, __u64 len
)
2993 struct fiemap
*fiemap
;
2994 unsigned int extent_count
= fieinfo
->fi_extents_max
;
2996 num_bytes
= sizeof(*fiemap
) + (extent_count
*
2997 sizeof(struct fiemap_extent
));
2998 fiemap
= libcfs_kvzalloc(num_bytes
, GFP_NOFS
);
3002 fiemap
->fm_flags
= fieinfo
->fi_flags
;
3003 fiemap
->fm_extent_count
= fieinfo
->fi_extents_max
;
3004 fiemap
->fm_start
= start
;
3005 fiemap
->fm_length
= len
;
3007 if (extent_count
> 0 &&
3008 copy_from_user(&fiemap
->fm_extents
[0], fieinfo
->fi_extents_start
,
3009 sizeof(struct fiemap_extent
))) {
3014 rc
= ll_do_fiemap(inode
, fiemap
, num_bytes
);
3016 fieinfo
->fi_flags
= fiemap
->fm_flags
;
3017 fieinfo
->fi_extents_mapped
= fiemap
->fm_mapped_extents
;
3018 if (extent_count
> 0 &&
3019 copy_to_user(fieinfo
->fi_extents_start
, &fiemap
->fm_extents
[0],
3020 fiemap
->fm_mapped_extents
*
3021 sizeof(struct fiemap_extent
))) {
3030 struct posix_acl
*ll_get_acl(struct inode
*inode
, int type
)
3032 struct ll_inode_info
*lli
= ll_i2info(inode
);
3033 struct posix_acl
*acl
= NULL
;
3035 spin_lock(&lli
->lli_lock
);
3036 /* VFS' acl_permission_check->check_acl will release the refcount */
3037 acl
= posix_acl_dup(lli
->lli_posix_acl
);
3038 #ifdef CONFIG_FS_POSIX_ACL
3039 forget_cached_acl(inode
, type
);
3041 spin_unlock(&lli
->lli_lock
);
3046 int ll_inode_permission(struct inode
*inode
, int mask
)
3048 struct ll_sb_info
*sbi
;
3049 struct root_squash_info
*squash
;
3050 const struct cred
*old_cred
= NULL
;
3051 struct cred
*cred
= NULL
;
3052 bool squash_id
= false;
3056 if (mask
& MAY_NOT_BLOCK
)
3059 /* as root inode are NOT getting validated in lookup operation,
3060 * need to do it before permission check.
3063 if (is_root_inode(inode
)) {
3064 rc
= __ll_inode_revalidate(inode
->i_sb
->s_root
,
3065 MDS_INODELOCK_LOOKUP
);
3070 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p), inode mode %x mask %o\n",
3071 PFID(ll_inode2fid(inode
)), inode
, inode
->i_mode
, mask
);
3073 /* squash fsuid/fsgid if needed */
3074 sbi
= ll_i2sbi(inode
);
3075 squash
= &sbi
->ll_squash
;
3076 if (unlikely(squash
->rsi_uid
&&
3077 uid_eq(current_fsuid(), GLOBAL_ROOT_UID
) &&
3078 !(sbi
->ll_flags
& LL_SBI_NOROOTSQUASH
))) {
3083 CDEBUG(D_OTHER
, "squash creds (%d:%d)=>(%d:%d)\n",
3084 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3085 squash
->rsi_uid
, squash
->rsi_gid
);
3088 * update current process's credentials
3091 cred
= prepare_creds();
3095 cred
->fsuid
= make_kuid(&init_user_ns
, squash
->rsi_uid
);
3096 cred
->fsgid
= make_kgid(&init_user_ns
, squash
->rsi_gid
);
3097 for (cap
= 0; cap
< sizeof(cfs_cap_t
) * 8; cap
++) {
3098 if ((1 << cap
) & CFS_CAP_FS_MASK
)
3099 cap_lower(cred
->cap_effective
, cap
);
3101 old_cred
= override_creds(cred
);
3104 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_INODE_PERM
, 1);
3105 rc
= generic_permission(inode
, mask
);
3107 /* restore current process's credentials and FS capability */
3109 revert_creds(old_cred
);
3116 /* -o localflock - only provides locally consistent flock locks */
3117 const struct file_operations ll_file_operations
= {
3118 .read_iter
= ll_file_read_iter
,
3119 .write_iter
= ll_file_write_iter
,
3120 .unlocked_ioctl
= ll_file_ioctl
,
3121 .open
= ll_file_open
,
3122 .release
= ll_file_release
,
3123 .mmap
= ll_file_mmap
,
3124 .llseek
= ll_file_seek
,
3125 .splice_read
= generic_file_splice_read
,
3130 const struct file_operations ll_file_operations_flock
= {
3131 .read_iter
= ll_file_read_iter
,
3132 .write_iter
= ll_file_write_iter
,
3133 .unlocked_ioctl
= ll_file_ioctl
,
3134 .open
= ll_file_open
,
3135 .release
= ll_file_release
,
3136 .mmap
= ll_file_mmap
,
3137 .llseek
= ll_file_seek
,
3138 .splice_read
= generic_file_splice_read
,
3141 .flock
= ll_file_flock
,
3142 .lock
= ll_file_flock
3145 /* These are for -o noflock - to return ENOSYS on flock calls */
3146 const struct file_operations ll_file_operations_noflock
= {
3147 .read_iter
= ll_file_read_iter
,
3148 .write_iter
= ll_file_write_iter
,
3149 .unlocked_ioctl
= ll_file_ioctl
,
3150 .open
= ll_file_open
,
3151 .release
= ll_file_release
,
3152 .mmap
= ll_file_mmap
,
3153 .llseek
= ll_file_seek
,
3154 .splice_read
= generic_file_splice_read
,
3157 .flock
= ll_file_noflock
,
3158 .lock
= ll_file_noflock
3161 const struct inode_operations ll_file_inode_operations
= {
3162 .setattr
= ll_setattr
,
3163 .getattr
= ll_getattr
,
3164 .permission
= ll_inode_permission
,
3165 .listxattr
= ll_listxattr
,
3166 .fiemap
= ll_fiemap
,
3167 .get_acl
= ll_get_acl
,
3170 /* dynamic ioctl number support routines */
3171 static struct llioc_ctl_data
{
3172 struct rw_semaphore ioc_sem
;
3173 struct list_head ioc_head
;
3175 __RWSEM_INITIALIZER(llioc
.ioc_sem
),
3176 LIST_HEAD_INIT(llioc
.ioc_head
)
3180 struct list_head iocd_list
;
3181 unsigned int iocd_size
;
3182 llioc_callback_t iocd_cb
;
3183 unsigned int iocd_count
;
3184 unsigned int iocd_cmd
[0];
3187 void *ll_iocontrol_register(llioc_callback_t cb
, int count
, unsigned int *cmd
)
3190 struct llioc_data
*in_data
= NULL
;
3192 if (!cb
|| !cmd
|| count
> LLIOC_MAX_CMD
|| count
< 0)
3195 size
= sizeof(*in_data
) + count
* sizeof(unsigned int);
3196 in_data
= kzalloc(size
, GFP_NOFS
);
3200 in_data
->iocd_size
= size
;
3201 in_data
->iocd_cb
= cb
;
3202 in_data
->iocd_count
= count
;
3203 memcpy(in_data
->iocd_cmd
, cmd
, sizeof(unsigned int) * count
);
3205 down_write(&llioc
.ioc_sem
);
3206 list_add_tail(&in_data
->iocd_list
, &llioc
.ioc_head
);
3207 up_write(&llioc
.ioc_sem
);
3211 EXPORT_SYMBOL(ll_iocontrol_register
);
3213 void ll_iocontrol_unregister(void *magic
)
3215 struct llioc_data
*tmp
;
3220 down_write(&llioc
.ioc_sem
);
3221 list_for_each_entry(tmp
, &llioc
.ioc_head
, iocd_list
) {
3223 list_del(&tmp
->iocd_list
);
3224 up_write(&llioc
.ioc_sem
);
3230 up_write(&llioc
.ioc_sem
);
3232 CWARN("didn't find iocontrol register block with magic: %p\n", magic
);
3234 EXPORT_SYMBOL(ll_iocontrol_unregister
);
3236 static enum llioc_iter
3237 ll_iocontrol_call(struct inode
*inode
, struct file
*file
,
3238 unsigned int cmd
, unsigned long arg
, int *rcp
)
3240 enum llioc_iter ret
= LLIOC_CONT
;
3241 struct llioc_data
*data
;
3242 int rc
= -EINVAL
, i
;
3244 down_read(&llioc
.ioc_sem
);
3245 list_for_each_entry(data
, &llioc
.ioc_head
, iocd_list
) {
3246 for (i
= 0; i
< data
->iocd_count
; i
++) {
3247 if (cmd
!= data
->iocd_cmd
[i
])
3250 ret
= data
->iocd_cb(inode
, file
, cmd
, arg
, data
, &rc
);
3254 if (ret
== LLIOC_STOP
)
3257 up_read(&llioc
.ioc_sem
);
3264 int ll_layout_conf(struct inode
*inode
, const struct cl_object_conf
*conf
)
3266 struct ll_inode_info
*lli
= ll_i2info(inode
);
3267 struct cl_object
*obj
= lli
->lli_clob
;
3275 env
= cl_env_get(&refcheck
);
3277 return PTR_ERR(env
);
3279 rc
= cl_conf_set(env
, obj
, conf
);
3283 if (conf
->coc_opc
== OBJECT_CONF_SET
) {
3284 struct ldlm_lock
*lock
= conf
->coc_lock
;
3285 struct cl_layout cl
= {
3290 LASSERT(ldlm_has_layout(lock
));
3292 /* it can only be allowed to match after layout is
3293 * applied to inode otherwise false layout would be
3294 * seen. Applying layout should happen before dropping
3297 ldlm_lock_allow_match(lock
);
3299 rc
= cl_object_layout_get(env
, obj
, &cl
);
3303 CDEBUG(D_VFSTRACE
, DFID
": layout version change: %u -> %u\n",
3304 PFID(&lli
->lli_fid
), ll_layout_version_get(lli
),
3306 ll_layout_version_set(lli
, cl
.cl_layout_gen
);
3309 cl_env_put(env
, &refcheck
);
3313 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3314 static int ll_layout_fetch(struct inode
*inode
, struct ldlm_lock
*lock
)
3317 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
3318 struct ptlrpc_request
*req
;
3319 struct mdt_body
*body
;
3325 CDEBUG(D_INODE
, DFID
" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3326 PFID(ll_inode2fid(inode
)), ldlm_is_lvb_ready(lock
),
3327 lock
->l_lvb_data
, lock
->l_lvb_len
);
3329 if (lock
->l_lvb_data
&& ldlm_is_lvb_ready(lock
))
3332 /* if layout lock was granted right away, the layout is returned
3333 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3334 * blocked and then granted via completion ast, we have to fetch
3335 * layout here. Please note that we can't use the LVB buffer in
3336 * completion AST because it doesn't have a large enough buffer
3338 rc
= ll_get_default_mdsize(sbi
, &lmmsize
);
3340 rc
= md_getxattr(sbi
->ll_md_exp
, ll_inode2fid(inode
),
3341 OBD_MD_FLXATTR
, XATTR_NAME_LOV
, NULL
, 0,
3346 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
3352 lmmsize
= body
->mbo_eadatasize
;
3353 if (lmmsize
== 0) /* empty layout */ {
3358 lmm
= req_capsule_server_sized_get(&req
->rq_pill
, &RMF_EADATA
, lmmsize
);
3364 lvbdata
= libcfs_kvzalloc(lmmsize
, GFP_NOFS
);
3370 memcpy(lvbdata
, lmm
, lmmsize
);
3371 lock_res_and_lock(lock
);
3372 if (lock
->l_lvb_data
)
3373 kvfree(lock
->l_lvb_data
);
3375 lock
->l_lvb_data
= lvbdata
;
3376 lock
->l_lvb_len
= lmmsize
;
3377 unlock_res_and_lock(lock
);
3380 ptlrpc_req_finished(req
);
3385 * Apply the layout to the inode. Layout lock is held and will be released
3388 static int ll_layout_lock_set(struct lustre_handle
*lockh
, enum ldlm_mode mode
,
3389 struct inode
*inode
)
3391 struct ll_inode_info
*lli
= ll_i2info(inode
);
3392 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
3393 struct ldlm_lock
*lock
;
3394 struct cl_object_conf conf
;
3397 bool wait_layout
= false;
3399 LASSERT(lustre_handle_is_used(lockh
));
3401 lock
= ldlm_handle2lock(lockh
);
3403 LASSERT(ldlm_has_layout(lock
));
3405 LDLM_DEBUG(lock
, "File " DFID
"(%p) being reconfigured",
3406 PFID(&lli
->lli_fid
), inode
);
3408 /* in case this is a caching lock and reinstate with new inode */
3409 md_set_lock_data(sbi
->ll_md_exp
, lockh
, inode
, NULL
);
3411 lock_res_and_lock(lock
);
3412 lvb_ready
= ldlm_is_lvb_ready(lock
);
3413 unlock_res_and_lock(lock
);
3414 /* checking lvb_ready is racy but this is okay. The worst case is
3415 * that multi processes may configure the file on the same time.
3422 rc
= ll_layout_fetch(inode
, lock
);
3426 /* for layout lock, lmm is returned in lock's lvb.
3427 * lvb_data is immutable if the lock is held so it's safe to access it
3430 * set layout to file. Unlikely this will fail as old layout was
3433 memset(&conf
, 0, sizeof(conf
));
3434 conf
.coc_opc
= OBJECT_CONF_SET
;
3435 conf
.coc_inode
= inode
;
3436 conf
.coc_lock
= lock
;
3437 conf
.u
.coc_layout
.lb_buf
= lock
->l_lvb_data
;
3438 conf
.u
.coc_layout
.lb_len
= lock
->l_lvb_len
;
3439 rc
= ll_layout_conf(inode
, &conf
);
3441 /* refresh layout failed, need to wait */
3442 wait_layout
= rc
== -EBUSY
;
3445 LDLM_LOCK_PUT(lock
);
3446 ldlm_lock_decref(lockh
, mode
);
3448 /* wait for IO to complete if it's still being used. */
3450 CDEBUG(D_INODE
, "%s: " DFID
"(%p) wait for layout reconf\n",
3451 ll_get_fsname(inode
->i_sb
, NULL
, 0),
3452 PFID(&lli
->lli_fid
), inode
);
3454 memset(&conf
, 0, sizeof(conf
));
3455 conf
.coc_opc
= OBJECT_CONF_WAIT
;
3456 conf
.coc_inode
= inode
;
3457 rc
= ll_layout_conf(inode
, &conf
);
3461 CDEBUG(D_INODE
, "%s: file=" DFID
" waiting layout return: %d.\n",
3462 ll_get_fsname(inode
->i_sb
, NULL
, 0),
3463 PFID(&lli
->lli_fid
), rc
);
3468 static int ll_layout_refresh_locked(struct inode
*inode
)
3470 struct ll_inode_info
*lli
= ll_i2info(inode
);
3471 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
3472 struct md_op_data
*op_data
;
3473 struct lookup_intent it
;
3474 struct lustre_handle lockh
;
3475 enum ldlm_mode mode
;
3476 struct ldlm_enqueue_info einfo
= {
3477 .ei_type
= LDLM_IBITS
,
3479 .ei_cb_bl
= &ll_md_blocking_ast
,
3480 .ei_cb_cp
= &ldlm_completion_ast
,
3485 /* mostly layout lock is caching on the local side, so try to match
3486 * it before grabbing layout lock mutex.
3488 mode
= ll_take_md_lock(inode
, MDS_INODELOCK_LAYOUT
, &lockh
, 0,
3489 LCK_CR
| LCK_CW
| LCK_PR
| LCK_PW
);
3490 if (mode
!= 0) { /* hit cached lock */
3491 rc
= ll_layout_lock_set(&lockh
, mode
, inode
);
3497 op_data
= ll_prep_md_op_data(NULL
, inode
, inode
, NULL
,
3498 0, 0, LUSTRE_OPC_ANY
, NULL
);
3499 if (IS_ERR(op_data
))
3500 return PTR_ERR(op_data
);
3502 /* have to enqueue one */
3503 memset(&it
, 0, sizeof(it
));
3504 it
.it_op
= IT_LAYOUT
;
3505 lockh
.cookie
= 0ULL;
3507 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file " DFID
"(%p)",
3508 ll_get_fsname(inode
->i_sb
, NULL
, 0),
3509 PFID(&lli
->lli_fid
), inode
);
3511 rc
= md_enqueue(sbi
->ll_md_exp
, &einfo
, NULL
, &it
, op_data
, &lockh
, 0);
3512 ptlrpc_req_finished(it
.it_request
);
3513 it
.it_request
= NULL
;
3515 ll_finish_md_op_data(op_data
);
3517 mode
= it
.it_lock_mode
;
3518 it
.it_lock_mode
= 0;
3519 ll_intent_drop_lock(&it
);
3522 /* set lock data in case this is a new lock */
3523 ll_set_lock_data(sbi
->ll_md_exp
, inode
, &it
, NULL
);
3524 rc
= ll_layout_lock_set(&lockh
, mode
, inode
);
3533 * This function checks if there exists a LAYOUT lock on the client side,
3534 * or enqueues it if it doesn't have one in cache.
3536 * This function will not hold layout lock so it may be revoked any time after
3537 * this function returns. Any operations depend on layout should be redone
3540 * This function should be called before lov_io_init() to get an uptodate
3541 * layout version, the caller should save the version number and after IO
3542 * is finished, this function should be called again to verify that layout
3543 * is not changed during IO time.
3545 int ll_layout_refresh(struct inode
*inode
, __u32
*gen
)
3547 struct ll_inode_info
*lli
= ll_i2info(inode
);
3548 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
3551 *gen
= ll_layout_version_get(lli
);
3552 if (!(sbi
->ll_flags
& LL_SBI_LAYOUT_LOCK
) || *gen
!= CL_LAYOUT_GEN_NONE
)
3556 LASSERT(fid_is_sane(ll_inode2fid(inode
)));
3557 LASSERT(S_ISREG(inode
->i_mode
));
3559 /* take layout lock mutex to enqueue layout lock exclusively. */
3560 mutex_lock(&lli
->lli_layout_mutex
);
3562 rc
= ll_layout_refresh_locked(inode
);
3566 *gen
= ll_layout_version_get(lli
);
3568 mutex_unlock(&lli
->lli_layout_mutex
);
3574 * This function send a restore request to the MDT
3576 int ll_layout_restore(struct inode
*inode
, loff_t offset
, __u64 length
)
3578 struct hsm_user_request
*hur
;
3581 len
= sizeof(struct hsm_user_request
) +
3582 sizeof(struct hsm_user_item
);
3583 hur
= kzalloc(len
, GFP_NOFS
);
3587 hur
->hur_request
.hr_action
= HUA_RESTORE
;
3588 hur
->hur_request
.hr_archive_id
= 0;
3589 hur
->hur_request
.hr_flags
= 0;
3590 memcpy(&hur
->hur_user_item
[0].hui_fid
, &ll_i2info(inode
)->lli_fid
,
3591 sizeof(hur
->hur_user_item
[0].hui_fid
));
3592 hur
->hur_user_item
[0].hui_extent
.offset
= offset
;
3593 hur
->hur_user_item
[0].hui_extent
.length
= length
;
3594 hur
->hur_request
.hr_itemcount
= 1;
3595 rc
= obd_iocontrol(LL_IOC_HSM_REQUEST
, ll_i2sbi(inode
)->ll_md_exp
,