4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
53 struct ll_file_data
*ll_file_data_get(void)
55 struct ll_file_data
*fd
;
57 OBD_SLAB_ALLOC_PTR_GFP(fd
, ll_file_data_slab
, __GFP_IO
);
60 fd
->fd_write_failed
= false;
64 static void ll_file_data_put(struct ll_file_data
*fd
)
67 OBD_SLAB_FREE_PTR(fd
, ll_file_data_slab
);
70 void ll_pack_inode2opdata(struct inode
*inode
, struct md_op_data
*op_data
,
71 struct lustre_handle
*fh
)
73 op_data
->op_fid1
= ll_i2info(inode
)->lli_fid
;
74 op_data
->op_attr
.ia_mode
= inode
->i_mode
;
75 op_data
->op_attr
.ia_atime
= inode
->i_atime
;
76 op_data
->op_attr
.ia_mtime
= inode
->i_mtime
;
77 op_data
->op_attr
.ia_ctime
= inode
->i_ctime
;
78 op_data
->op_attr
.ia_size
= i_size_read(inode
);
79 op_data
->op_attr_blocks
= inode
->i_blocks
;
80 ((struct ll_iattr
*)&op_data
->op_attr
)->ia_attr_flags
=
81 ll_inode_to_ext_flags(inode
->i_flags
);
82 op_data
->op_ioepoch
= ll_i2info(inode
)->lli_ioepoch
;
84 op_data
->op_handle
= *fh
;
85 op_data
->op_capa1
= ll_mdscapa_get(inode
);
87 if (LLIF_DATA_MODIFIED
& ll_i2info(inode
)->lli_flags
)
88 op_data
->op_bias
|= MDS_DATA_MODIFIED
;
92 * Closes the IO epoch and packs all the attributes into @op_data for
95 static void ll_prepare_close(struct inode
*inode
, struct md_op_data
*op_data
,
96 struct obd_client_handle
*och
)
98 op_data
->op_attr
.ia_valid
= ATTR_MODE
| ATTR_ATIME
| ATTR_ATIME_SET
|
99 ATTR_MTIME
| ATTR_MTIME_SET
|
100 ATTR_CTIME
| ATTR_CTIME_SET
;
102 if (!(och
->och_flags
& FMODE_WRITE
))
105 if (!exp_connect_som(ll_i2mdexp(inode
)) || !S_ISREG(inode
->i_mode
))
106 op_data
->op_attr
.ia_valid
|= ATTR_SIZE
| ATTR_BLOCKS
;
108 ll_ioepoch_close(inode
, op_data
, &och
, 0);
111 ll_pack_inode2opdata(inode
, op_data
, &och
->och_fh
);
112 ll_prep_md_op_data(op_data
, inode
, NULL
, NULL
,
113 0, 0, LUSTRE_OPC_ANY
, NULL
);
/*
 * NOTE(review): this block was damaged by extraction — tokens are split
 * across lines and several lines (braces, returns, conditions) are
 * missing.  Code left byte-identical; restore from upstream
 * lustre/llite/file.c before building.
 *
 * Purpose (from the visible calls): send the MDS close RPC for open
 * handle @och — prepare close attributes (ll_prepare_close), md_close(),
 * on SOM update Size-on-MDS (ll_som_update), clear LLIF_DATA_MODIFIED
 * under lli_lock on success, destroy OST objects of an unlinked file
 * (ll_objects_destroy), queue DONE_WRITING when the epoch stayed open,
 * clear open replay data and poison och_fh with DEAD_HANDLE_MAGIC.
 */
116 static int ll_close_inode_openhandle(struct obd_export
*md_exp
,
118 struct obd_client_handle
*och
)
120 struct obd_export
*exp
= ll_i2mdexp(inode
);
121 struct md_op_data
*op_data
;
122 struct ptlrpc_request
*req
= NULL
;
123 struct obd_device
*obd
= class_exp2obd(exp
);
129 * XXX: in case of LMV, is this correct to access
132 CERROR("Invalid MDC connection handle "LPX64
"\n",
133 ll_i2mdexp(inode
)->exp_handle
.h_cookie
);
137 OBD_ALLOC_PTR(op_data
);
139 GOTO(out
, rc
= -ENOMEM
); // XXX We leak openhandle and request here.
141 ll_prepare_close(inode
, op_data
, och
);
142 epoch_close
= (op_data
->op_flags
& MF_EPOCH_CLOSE
);
143 rc
= md_close(md_exp
, op_data
, och
->och_mod
, &req
);
145 /* This close must have the epoch closed. */
146 LASSERT(epoch_close
);
147 /* MDS has instructed us to obtain Size-on-MDS attribute from
148 * OSTs and send setattr to back to MDS. */
149 rc
= ll_som_update(inode
, op_data
);
151 CERROR("inode %lu mdc Size-on-MDS update failed: "
152 "rc = %d\n", inode
->i_ino
, rc
);
156 CERROR("inode %lu mdc close failed: rc = %d\n",
160 /* DATA_MODIFIED flag was successfully sent on close, cancel data
161 * modification flag. */
162 if (rc
== 0 && (op_data
->op_bias
& MDS_DATA_MODIFIED
)) {
163 struct ll_inode_info
*lli
= ll_i2info(inode
);
165 spin_lock(&lli
->lli_lock
);
166 lli
->lli_flags
&= ~LLIF_DATA_MODIFIED
;
167 spin_unlock(&lli
->lli_lock
);
170 ll_finish_md_op_data(op_data
);
173 rc
= ll_objects_destroy(req
, inode
);
175 CERROR("inode %lu ll_objects destroy: rc = %d\n",
180 if (exp_connect_som(exp
) && !epoch_close
&&
181 S_ISREG(inode
->i_mode
) && (och
->och_flags
& FMODE_WRITE
)) {
182 ll_queue_done_writing(inode
, LLIF_DONE_WRITING
);
184 md_clear_open_replay_data(md_exp
, och
);
185 /* Free @och if it is not waiting for DONE_WRITING. */
186 och
->och_fh
.cookie
= DEAD_HANDLE_MAGIC
;
189 if (req
) /* This is close request */
190 ptlrpc_req_finished(req
);
/*
 * NOTE(review): extraction-damaged block, left byte-identical; restore
 * from upstream lustre/llite/file.c.
 *
 * Purpose (from visible code): pick the per-mode MDS open handle slot
 * (write/exec/read) and its use count on the inode; under lli_och_mutex,
 * bail out while other users remain, otherwise take ownership of the
 * handle and close it on the MDS via ll_close_inode_openhandle().
 */
194 int ll_md_real_close(struct inode
*inode
, int flags
)
196 struct ll_inode_info
*lli
= ll_i2info(inode
);
197 struct obd_client_handle
**och_p
;
198 struct obd_client_handle
*och
;
202 if (flags
& FMODE_WRITE
) {
203 och_p
= &lli
->lli_mds_write_och
;
204 och_usecount
= &lli
->lli_open_fd_write_count
;
205 } else if (flags
& FMODE_EXEC
) {
206 och_p
= &lli
->lli_mds_exec_och
;
207 och_usecount
= &lli
->lli_open_fd_exec_count
;
209 LASSERT(flags
& FMODE_READ
);
210 och_p
= &lli
->lli_mds_read_och
;
211 och_usecount
= &lli
->lli_open_fd_read_count
;
214 mutex_lock(&lli
->lli_och_mutex
);
215 if (*och_usecount
) { /* There are still users of this handle, so
217 mutex_unlock(&lli
->lli_och_mutex
);
222 mutex_unlock(&lli
->lli_och_mutex
);
224 if (och
) { /* There might be a race and somebody have freed this och
226 rc
= ll_close_inode_openhandle(ll_i2sbi(inode
)->ll_md_exp
,
/*
 * NOTE(review): extraction-damaged block, left byte-identical; restore
 * from upstream lustre/llite/file.c.
 *
 * Purpose (from visible code): per-file close path — drop a group lock
 * if held, decrement the matching open-fd count for the fd's open mode
 * under lli_och_mutex, and only talk to the MDS (ll_md_real_close) when
 * no cached OPEN ldlm lock matches (md_lock_match with
 * LDLM_FL_BLOCK_GRANTED|LDLM_FL_TEST_LOCK on MDS_INODELOCK_OPEN).
 * Finally detach and free the ll_file_data and close the capability.
 */
233 int ll_md_close(struct obd_export
*md_exp
, struct inode
*inode
,
236 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
237 struct ll_inode_info
*lli
= ll_i2info(inode
);
240 /* clear group lock, if present */
241 if (unlikely(fd
->fd_flags
& LL_FILE_GROUP_LOCKED
))
242 ll_put_grouplock(inode
, file
, fd
->fd_grouplock
.cg_gid
);
244 /* Let's see if we have good enough OPEN lock on the file and if
245 we can skip talking to MDS */
246 if (file
->f_dentry
->d_inode
) { /* Can this ever be false? */
248 int flags
= LDLM_FL_BLOCK_GRANTED
| LDLM_FL_TEST_LOCK
;
249 struct lustre_handle lockh
;
250 struct inode
*inode
= file
->f_dentry
->d_inode
;
251 ldlm_policy_data_t policy
= {.l_inodebits
={MDS_INODELOCK_OPEN
}};
253 mutex_lock(&lli
->lli_och_mutex
);
254 if (fd
->fd_omode
& FMODE_WRITE
) {
256 LASSERT(lli
->lli_open_fd_write_count
);
257 lli
->lli_open_fd_write_count
--;
258 } else if (fd
->fd_omode
& FMODE_EXEC
) {
260 LASSERT(lli
->lli_open_fd_exec_count
);
261 lli
->lli_open_fd_exec_count
--;
264 LASSERT(lli
->lli_open_fd_read_count
);
265 lli
->lli_open_fd_read_count
--;
267 mutex_unlock(&lli
->lli_och_mutex
);
269 if (!md_lock_match(md_exp
, flags
, ll_inode2fid(inode
),
270 LDLM_IBITS
, &policy
, lockmode
,
272 rc
= ll_md_real_close(file
->f_dentry
->d_inode
,
276 CERROR("Releasing a file %p with negative dentry %p. Name %s",
277 file
, file
->f_dentry
, file
->f_dentry
->d_name
.name
);
280 LUSTRE_FPRIVATE(file
) = NULL
;
281 ll_file_data_put(fd
);
282 ll_capa_close(inode
);
/*
 * NOTE(review): extraction-damaged block, left byte-identical; restore
 * from upstream lustre/llite/file.c.
 *
 * Purpose (from visible code): VFS ->release hook — tear down remote
 * ACL state for the root inode on RMT clients, tally LPROC_LL_RELEASE,
 * stop a statahead thread owned by this fd, short-circuit for the root
 * dentry, clear per-object async rc for regular files, then do the real
 * close via ll_md_close().
 */
287 /* While this returns an error code, fput() the caller does not, so we need
288 * to make every effort to clean up all of our state here. Also, applications
289 * rarely check close errors and even if an error is returned they will not
290 * re-try the close call.
292 int ll_file_release(struct inode
*inode
, struct file
*file
)
294 struct ll_file_data
*fd
;
295 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
296 struct ll_inode_info
*lli
= ll_i2info(inode
);
299 CDEBUG(D_VFSTRACE
, "VFS Op:inode=%lu/%u(%p)\n", inode
->i_ino
,
300 inode
->i_generation
, inode
);
302 #ifdef CONFIG_FS_POSIX_ACL
303 if (sbi
->ll_flags
& LL_SBI_RMT_CLIENT
&&
304 inode
== inode
->i_sb
->s_root
->d_inode
) {
305 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
308 if (unlikely(fd
->fd_flags
& LL_FILE_RMTACL
)) {
309 fd
->fd_flags
&= ~LL_FILE_RMTACL
;
310 rct_del(&sbi
->ll_rct
, current_pid());
311 et_search_free(&sbi
->ll_et
, current_pid());
316 if (inode
->i_sb
->s_root
!= file
->f_dentry
)
317 ll_stats_ops_tally(sbi
, LPROC_LL_RELEASE
, 1);
318 fd
= LUSTRE_FPRIVATE(file
);
321 /* The last ref on @file, maybe not the the owner pid of statahead.
322 * Different processes can open the same dir, "ll_opendir_key" means:
323 * it is me that should stop the statahead thread. */
324 if (S_ISDIR(inode
->i_mode
) && lli
->lli_opendir_key
== fd
&&
325 lli
->lli_opendir_pid
!= 0)
326 ll_stop_statahead(inode
, lli
->lli_opendir_key
);
328 if (inode
->i_sb
->s_root
== file
->f_dentry
) {
329 LUSTRE_FPRIVATE(file
) = NULL
;
330 ll_file_data_put(fd
);
334 if (!S_ISDIR(inode
->i_mode
)) {
335 lov_read_and_clear_async_rc(lli
->lli_clob
);
336 lli
->lli_async_rc
= 0;
339 rc
= ll_md_close(sbi
->ll_md_exp
, inode
, file
);
341 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG
, cfs_fail_val
))
342 libcfs_debug_dumplog();
/*
 * NOTE(review): extraction-damaged block, left byte-identical; restore
 * from upstream lustre/llite/file.c.
 *
 * Purpose (from visible code): enqueue an IT_OPEN intent on the MDS for
 * @file.  When no striping data is being set (lmm == NULL, lmmsize == 0)
 * it also requests an OPEN lock (MDS_OPEN_LOCK) and uses
 * LUSTRE_OPC_CREATE for write opens.  On success it updates the inode
 * from the reply (ll_prep_inode) and attaches the lock via
 * ll_set_lock_data; the ESTALE path releases the open handle instead of
 * flooding the log.
 */
347 static int ll_intent_file_open(struct file
*file
, void *lmm
,
348 int lmmsize
, struct lookup_intent
*itp
)
350 struct ll_sb_info
*sbi
= ll_i2sbi(file
->f_dentry
->d_inode
);
351 struct dentry
*parent
= file
->f_dentry
->d_parent
;
352 const char *name
= file
->f_dentry
->d_name
.name
;
353 const int len
= file
->f_dentry
->d_name
.len
;
354 struct md_op_data
*op_data
;
355 struct ptlrpc_request
*req
;
356 __u32 opc
= LUSTRE_OPC_ANY
;
362 /* Usually we come here only for NFSD, and we want open lock.
363 But we can also get here with pre 2.6.15 patchless kernels, and in
364 that case that lock is also ok */
365 /* We can also get here if there was cached open handle in revalidate_it
366 * but it disappeared while we were getting from there to ll_file_open.
367 * But this means this file was closed and immediatelly opened which
368 * makes a good candidate for using OPEN lock */
369 /* If lmmsize & lmm are not 0, we are just setting stripe info
370 * parameters. No need for the open lock */
371 if (lmm
== NULL
&& lmmsize
== 0) {
372 itp
->it_flags
|= MDS_OPEN_LOCK
;
373 if (itp
->it_flags
& FMODE_WRITE
)
374 opc
= LUSTRE_OPC_CREATE
;
377 op_data
= ll_prep_md_op_data(NULL
, parent
->d_inode
,
378 file
->f_dentry
->d_inode
, name
, len
,
381 return PTR_ERR(op_data
);
383 itp
->it_flags
|= MDS_OPEN_BY_FID
;
384 rc
= md_intent_lock(sbi
->ll_md_exp
, op_data
, lmm
, lmmsize
, itp
,
385 0 /*unused */, &req
, ll_md_blocking_ast
, 0);
386 ll_finish_md_op_data(op_data
);
388 /* reason for keep own exit path - don`t flood log
389 * with messages with -ESTALE errors.
391 if (!it_disposition(itp
, DISP_OPEN_OPEN
) ||
392 it_open_error(DISP_OPEN_OPEN
, itp
))
394 ll_release_openhandle(file
->f_dentry
, itp
);
398 if (it_disposition(itp
, DISP_LOOKUP_NEG
))
399 GOTO(out
, rc
= -ENOENT
);
401 if (rc
!= 0 || it_open_error(DISP_OPEN_OPEN
, itp
)) {
402 rc
= rc
? rc
: it_open_error(DISP_OPEN_OPEN
, itp
);
403 CDEBUG(D_VFSTRACE
, "lock enqueue: err: %d\n", rc
);
407 rc
= ll_prep_inode(&file
->f_dentry
->d_inode
, req
, NULL
, itp
);
408 if (!rc
&& itp
->d
.lustre
.it_lock_mode
)
409 ll_set_lock_data(sbi
->ll_md_exp
, file
->f_dentry
->d_inode
,
413 ptlrpc_req_finished(itp
->d
.lustre
.it_data
);
414 it_clear_disposition(itp
, DISP_ENQ_COMPLETE
);
415 ll_intent_drop_lock(itp
);
421 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
422 * not believe attributes if a few ioepoch holders exist. Attributes for
423 * previous ioepoch if new one is opened are also skipped by MDS.
425 void ll_ioepoch_open(struct ll_inode_info
*lli
, __u64 ioepoch
)
427 if (ioepoch
&& lli
->lli_ioepoch
!= ioepoch
) {
428 lli
->lli_ioepoch
= ioepoch
;
429 CDEBUG(D_INODE
, "Epoch "LPU64
" opened on "DFID
"\n",
430 ioepoch
, PFID(&lli
->lli_fid
));
434 static int ll_och_fill(struct obd_export
*md_exp
, struct ll_inode_info
*lli
,
435 struct lookup_intent
*it
, struct obd_client_handle
*och
)
437 struct ptlrpc_request
*req
= it
->d
.lustre
.it_data
;
438 struct mdt_body
*body
;
442 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
443 LASSERT(body
!= NULL
); /* reply already checked out */
445 memcpy(&och
->och_fh
, &body
->handle
, sizeof(body
->handle
));
446 och
->och_magic
= OBD_CLIENT_HANDLE_MAGIC
;
447 och
->och_fid
= lli
->lli_fid
;
448 och
->och_flags
= it
->it_flags
;
449 ll_ioepoch_open(lli
, body
->ioepoch
);
451 return md_set_open_replay_data(md_exp
, och
, req
);
/*
 * NOTE(review): extraction-damaged block, left byte-identical; restore
 * from upstream lustre/llite/file.c.
 *
 * Purpose (from visible code): finish an open locally — optionally fill
 * @och from the intent reply (ll_och_fill), log the epoch for SOM-sized
 * write opens, then attach @fd as the file's private data, initialise
 * readahead state and record the open mode in fd_omode.
 */
454 int ll_local_open(struct file
*file
, struct lookup_intent
*it
,
455 struct ll_file_data
*fd
, struct obd_client_handle
*och
)
457 struct inode
*inode
= file
->f_dentry
->d_inode
;
458 struct ll_inode_info
*lli
= ll_i2info(inode
);
460 LASSERT(!LUSTRE_FPRIVATE(file
));
465 struct ptlrpc_request
*req
= it
->d
.lustre
.it_data
;
466 struct mdt_body
*body
;
469 rc
= ll_och_fill(ll_i2sbi(inode
)->ll_md_exp
, lli
, it
, och
);
473 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
474 if ((it
->it_flags
& FMODE_WRITE
) &&
475 (body
->valid
& OBD_MD_FLSIZE
))
476 CDEBUG(D_INODE
, "Epoch "LPU64
" opened on "DFID
"\n",
477 lli
->lli_ioepoch
, PFID(&lli
->lli_fid
));
480 LUSTRE_FPRIVATE(file
) = fd
;
481 ll_readahead_init(inode
, &fd
->fd_ras
);
482 fd
->fd_omode
= it
->it_flags
;
/*
 * NOTE(review): extraction-damaged block (many branches and the epilogue
 * labels out_och_free/out_openerr are truncated), left byte-identical;
 * restore from upstream lustre/llite/file.c.
 *
 * Purpose (from visible code): VFS ->open hook.  Allocates ll_file_data,
 * registers statahead ownership for directories, short-circuits for the
 * root dentry, converts f_flags into an IT_OPEN intent when no cached
 * disposition exists, reuses or creates the per-mode MDS open handle
 * under lli_och_mutex (enqueuing via ll_intent_file_open when needed),
 * completes the open locally with ll_local_open, and handles
 * O_LOV_DELAY_CREATE object-creation deferral.
 */
486 /* Open a file, and (for the very first open) create objects on the OSTs at
487 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
488 * creation or open until ll_lov_setstripe() ioctl is called.
490 * If we already have the stripe MD locally then we don't request it in
491 * md_open(), by passing a lmm_size = 0.
493 * It is up to the application to ensure no other processes open this file
494 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
495 * used. We might be able to avoid races of that sort by getting lli_open_sem
496 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
497 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
499 int ll_file_open(struct inode
*inode
, struct file
*file
)
501 struct ll_inode_info
*lli
= ll_i2info(inode
);
502 struct lookup_intent
*it
, oit
= { .it_op
= IT_OPEN
,
503 .it_flags
= file
->f_flags
};
504 struct obd_client_handle
**och_p
= NULL
;
505 __u64
*och_usecount
= NULL
;
506 struct ll_file_data
*fd
;
507 int rc
= 0, opendir_set
= 0;
509 CDEBUG(D_VFSTRACE
, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode
->i_ino
,
510 inode
->i_generation
, inode
, file
->f_flags
);
512 it
= file
->private_data
; /* XXX: compat macro */
513 file
->private_data
= NULL
; /* prevent ll_local_open assertion */
515 fd
= ll_file_data_get();
517 GOTO(out_openerr
, rc
= -ENOMEM
);
520 if (S_ISDIR(inode
->i_mode
)) {
521 spin_lock(&lli
->lli_sa_lock
);
522 if (lli
->lli_opendir_key
== NULL
&& lli
->lli_sai
== NULL
&&
523 lli
->lli_opendir_pid
== 0) {
524 lli
->lli_opendir_key
= fd
;
525 lli
->lli_opendir_pid
= current_pid();
528 spin_unlock(&lli
->lli_sa_lock
);
531 if (inode
->i_sb
->s_root
== file
->f_dentry
) {
532 LUSTRE_FPRIVATE(file
) = fd
;
536 if (!it
|| !it
->d
.lustre
.it_disposition
) {
537 /* Convert f_flags into access mode. We cannot use file->f_mode,
538 * because everything but O_ACCMODE mask was stripped from
540 if ((oit
.it_flags
+ 1) & O_ACCMODE
)
542 if (file
->f_flags
& O_TRUNC
)
543 oit
.it_flags
|= FMODE_WRITE
;
545 /* kernel only call f_op->open in dentry_open. filp_open calls
546 * dentry_open after call to open_namei that checks permissions.
547 * Only nfsd_open call dentry_open directly without checking
548 * permissions and because of that this code below is safe. */
549 if (oit
.it_flags
& (FMODE_WRITE
| FMODE_READ
))
550 oit
.it_flags
|= MDS_OPEN_OWNEROVERRIDE
;
552 /* We do not want O_EXCL here, presumably we opened the file
553 * already? XXX - NFS implications? */
554 oit
.it_flags
&= ~O_EXCL
;
556 /* bug20584, if "it_flags" contains O_CREAT, the file will be
557 * created if necessary, then "IT_CREAT" should be set to keep
558 * consistent with it */
559 if (oit
.it_flags
& O_CREAT
)
560 oit
.it_op
|= IT_CREAT
;
566 /* Let's see if we have file open on MDS already. */
567 if (it
->it_flags
& FMODE_WRITE
) {
568 och_p
= &lli
->lli_mds_write_och
;
569 och_usecount
= &lli
->lli_open_fd_write_count
;
570 } else if (it
->it_flags
& FMODE_EXEC
) {
571 och_p
= &lli
->lli_mds_exec_och
;
572 och_usecount
= &lli
->lli_open_fd_exec_count
;
574 och_p
= &lli
->lli_mds_read_och
;
575 och_usecount
= &lli
->lli_open_fd_read_count
;
578 mutex_lock(&lli
->lli_och_mutex
);
579 if (*och_p
) { /* Open handle is present */
580 if (it_disposition(it
, DISP_OPEN_OPEN
)) {
581 /* Well, there's extra open request that we do not need,
582 let's close it somehow. This will decref request. */
583 rc
= it_open_error(DISP_OPEN_OPEN
, it
);
585 mutex_unlock(&lli
->lli_och_mutex
);
586 GOTO(out_openerr
, rc
);
589 ll_release_openhandle(file
->f_dentry
, it
);
593 rc
= ll_local_open(file
, it
, fd
, NULL
);
596 mutex_unlock(&lli
->lli_och_mutex
);
597 GOTO(out_openerr
, rc
);
600 LASSERT(*och_usecount
== 0);
601 if (!it
->d
.lustre
.it_disposition
) {
602 /* We cannot just request lock handle now, new ELC code
603 means that one of other OPEN locks for this file
604 could be cancelled, and since blocking ast handler
605 would attempt to grab och_mutex as well, that would
606 result in a deadlock */
607 mutex_unlock(&lli
->lli_och_mutex
);
608 it
->it_create_mode
|= M_CHECK_STALE
;
609 rc
= ll_intent_file_open(file
, NULL
, 0, it
);
610 it
->it_create_mode
&= ~M_CHECK_STALE
;
612 GOTO(out_openerr
, rc
);
616 OBD_ALLOC(*och_p
, sizeof (struct obd_client_handle
));
618 GOTO(out_och_free
, rc
= -ENOMEM
);
622 /* md_intent_lock() didn't get a request ref if there was an
623 * open error, so don't do cleanup on the request here
625 /* XXX (green): Should not we bail out on any error here, not
626 * just open error? */
627 rc
= it_open_error(DISP_OPEN_OPEN
, it
);
629 GOTO(out_och_free
, rc
);
631 LASSERT(it_disposition(it
, DISP_ENQ_OPEN_REF
));
633 rc
= ll_local_open(file
, it
, fd
, *och_p
);
635 GOTO(out_och_free
, rc
);
637 mutex_unlock(&lli
->lli_och_mutex
);
640 /* Must do this outside lli_och_mutex lock to prevent deadlock where
641 different kind of OPEN lock for this same inode gets cancelled
642 by ldlm_cancel_lru */
643 if (!S_ISREG(inode
->i_mode
))
644 GOTO(out_och_free
, rc
);
648 if (!lli
->lli_has_smd
) {
649 if (file
->f_flags
& O_LOV_DELAY_CREATE
||
650 !(file
->f_mode
& FMODE_WRITE
)) {
651 CDEBUG(D_INODE
, "object creation was delayed\n");
652 GOTO(out_och_free
, rc
);
655 file
->f_flags
&= ~O_LOV_DELAY_CREATE
;
656 GOTO(out_och_free
, rc
);
660 if (och_p
&& *och_p
) {
661 OBD_FREE(*och_p
, sizeof (struct obd_client_handle
));
662 *och_p
= NULL
; /* OBD_FREE writes some magic there */
665 mutex_unlock(&lli
->lli_och_mutex
);
668 if (opendir_set
!= 0)
669 ll_stop_statahead(inode
, lli
->lli_opendir_key
);
671 ll_file_data_put(fd
);
673 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_OPEN
, 1);
676 if (it
&& it_disposition(it
, DISP_ENQ_OPEN_REF
)) {
677 ptlrpc_req_finished(it
->d
.lustre
.it_data
);
678 it_clear_disposition(it
, DISP_ENQ_OPEN_REF
);
/*
 * NOTE(review): extraction-damaged block (obdo allocation and the
 * error/exit path are truncated), left byte-identical; restore from
 * upstream lustre/llite/file.c.
 *
 * Purpose (from visible code): asynchronous OST getattr for @lsm —
 * builds an obd_info with the requested OBD_MD_* valid mask (adding
 * OBD_FL_SRVLOCK when @sync), runs obd_getattr_async on a ptlrpc set,
 * waits, then masks o_valid down to the fields callers may trust.
 */
684 /* Fills the obdo with the attributes for the lsm */
685 static int ll_lsm_getattr(struct lov_stripe_md
*lsm
, struct obd_export
*exp
,
686 struct obd_capa
*capa
, struct obdo
*obdo
,
687 __u64 ioepoch
, int sync
)
689 struct ptlrpc_request_set
*set
;
690 struct obd_info oinfo
= { { { 0 } } };
693 LASSERT(lsm
!= NULL
);
697 oinfo
.oi_oa
->o_oi
= lsm
->lsm_oi
;
698 oinfo
.oi_oa
->o_mode
= S_IFREG
;
699 oinfo
.oi_oa
->o_ioepoch
= ioepoch
;
700 oinfo
.oi_oa
->o_valid
= OBD_MD_FLID
| OBD_MD_FLTYPE
|
701 OBD_MD_FLSIZE
| OBD_MD_FLBLOCKS
|
702 OBD_MD_FLBLKSZ
| OBD_MD_FLATIME
|
703 OBD_MD_FLMTIME
| OBD_MD_FLCTIME
|
704 OBD_MD_FLGROUP
| OBD_MD_FLEPOCH
|
705 OBD_MD_FLDATAVERSION
;
706 oinfo
.oi_capa
= capa
;
708 oinfo
.oi_oa
->o_valid
|= OBD_MD_FLFLAGS
;
709 oinfo
.oi_oa
->o_flags
|= OBD_FL_SRVLOCK
;
712 set
= ptlrpc_prep_set();
714 CERROR("can't allocate ptlrpc set\n");
717 rc
= obd_getattr_async(exp
, &oinfo
, set
);
719 rc
= ptlrpc_set_wait(set
);
720 ptlrpc_set_destroy(set
);
723 oinfo
.oi_oa
->o_valid
&= (OBD_MD_FLBLOCKS
| OBD_MD_FLBLKSZ
|
724 OBD_MD_FLATIME
| OBD_MD_FLMTIME
|
725 OBD_MD_FLCTIME
| OBD_MD_FLSIZE
|
726 OBD_MD_FLDATAVERSION
);
/*
 * NOTE(review): extraction-damaged block, left byte-identical; restore
 * from upstream lustre/llite/file.c.
 *
 * Purpose (from visible code): fetch OST attributes for @inode via
 * ll_lsm_getattr() under an MDS capability, refresh the inode from the
 * returned obdo on success, and release the lsm/capa references.
 */
731 * Performs the getattr on the inode and updates its fields.
732 * If @sync != 0, perform the getattr under the server-side lock.
734 int ll_inode_getattr(struct inode
*inode
, struct obdo
*obdo
,
735 __u64 ioepoch
, int sync
)
737 struct obd_capa
*capa
= ll_mdscapa_get(inode
);
738 struct lov_stripe_md
*lsm
;
741 lsm
= ccc_inode_lsm_get(inode
);
742 rc
= ll_lsm_getattr(lsm
, ll_i2dtexp(inode
),
743 capa
, obdo
, ioepoch
, sync
);
746 struct ost_id
*oi
= lsm
? &lsm
->lsm_oi
: &obdo
->o_oi
;
748 obdo_refresh_inode(inode
, obdo
, obdo
->o_valid
);
749 CDEBUG(D_INODE
, "objid "DOSTID
" size %llu, blocks %llu,"
750 " blksize %lu\n", POSTID(oi
), i_size_read(inode
),
751 (unsigned long long)inode
->i_blocks
,
752 (unsigned long)ll_inode_blksize(inode
));
754 ccc_inode_lsm_put(inode
, lsm
);
/*
 * NOTE(review): extraction-damaged block (the local `struct ost_lvb lvb`
 * declaration and several branch lines are missing), left byte-identical;
 * restore from upstream lustre/llite/file.c.
 *
 * Purpose (from visible code): under the inode size lock, merge MDS
 * timestamps cached in lli_lvb with attributes obtained from the cl
 * object (OSTs): take the newer of each timestamp, write the size via
 * cl_isize_write_nolock and update i_blocks, then push the merged
 * timestamps back into the inode.
 */
758 int ll_merge_lvb(const struct lu_env
*env
, struct inode
*inode
)
760 struct ll_inode_info
*lli
= ll_i2info(inode
);
761 struct cl_object
*obj
= lli
->lli_clob
;
762 struct cl_attr
*attr
= ccc_env_thread_attr(env
);
766 ll_inode_size_lock(inode
);
767 /* merge timestamps the most recently obtained from mds with
768 timestamps obtained from osts */
769 LTIME_S(inode
->i_atime
) = lli
->lli_lvb
.lvb_atime
;
770 LTIME_S(inode
->i_mtime
) = lli
->lli_lvb
.lvb_mtime
;
771 LTIME_S(inode
->i_ctime
) = lli
->lli_lvb
.lvb_ctime
;
772 inode_init_lvb(inode
, &lvb
);
774 cl_object_attr_lock(obj
);
775 rc
= cl_object_attr_get(env
, obj
, attr
);
776 cl_object_attr_unlock(obj
);
779 if (lvb
.lvb_atime
< attr
->cat_atime
)
780 lvb
.lvb_atime
= attr
->cat_atime
;
781 if (lvb
.lvb_ctime
< attr
->cat_ctime
)
782 lvb
.lvb_ctime
= attr
->cat_ctime
;
783 if (lvb
.lvb_mtime
< attr
->cat_mtime
)
784 lvb
.lvb_mtime
= attr
->cat_mtime
;
786 CDEBUG(D_VFSTRACE
, DFID
" updating i_size "LPU64
"\n",
787 PFID(&lli
->lli_fid
), attr
->cat_size
);
788 cl_isize_write_nolock(inode
, attr
->cat_size
);
790 inode
->i_blocks
= attr
->cat_blocks
;
792 LTIME_S(inode
->i_mtime
) = lvb
.lvb_mtime
;
793 LTIME_S(inode
->i_atime
) = lvb
.lvb_atime
;
794 LTIME_S(inode
->i_ctime
) = lvb
.lvb_ctime
;
796 ll_inode_size_unlock(inode
);
801 int ll_glimpse_ioctl(struct ll_sb_info
*sbi
, struct lov_stripe_md
*lsm
,
804 struct obdo obdo
= { 0 };
807 rc
= ll_lsm_getattr(lsm
, sbi
->ll_dt_exp
, NULL
, &obdo
, 0, 0);
809 st
->st_size
= obdo
.o_size
;
810 st
->st_blocks
= obdo
.o_blocks
;
811 st
->st_mtime
= obdo
.o_mtime
;
812 st
->st_atime
= obdo
.o_atime
;
813 st
->st_ctime
= obdo
.o_ctime
;
/*
 * NOTE(review): extraction-damaged block (the tail of the wr_sync
 * expression after O_DIRECT and the write-only guard are missing), left
 * byte-identical; restore from upstream lustre/llite/file.c.
 *
 * Purpose (from visible code): initialise a cl_io from the file's open
 * flags — nonblocking/append/sync bits, the cl object, and the lock
 * requirement (never for nolock files, mandatory for O_APPEND,
 * otherwise CILR_MAYBE).
 */
818 void ll_io_init(struct cl_io
*io
, const struct file
*file
, int write
)
820 struct inode
*inode
= file
->f_dentry
->d_inode
;
822 io
->u
.ci_rw
.crw_nonblock
= file
->f_flags
& O_NONBLOCK
;
824 io
->u
.ci_wr
.wr_append
= !!(file
->f_flags
& O_APPEND
);
825 io
->u
.ci_wr
.wr_sync
= file
->f_flags
& O_SYNC
||
826 file
->f_flags
& O_DIRECT
||
829 io
->ci_obj
= ll_i2info(inode
)->lli_clob
;
830 io
->ci_lockreq
= CILR_MAYBE
;
831 if (ll_file_nolock(file
)) {
832 io
->ci_lockreq
= CILR_NEVER
;
833 io
->ci_no_srvlock
= 1;
834 } else if (file
->f_flags
& O_APPEND
) {
835 io
->ci_lockreq
= CILR_MANDATORY
;
/*
 * NOTE(review): extraction-damaged block (return-type line, switch case
 * labels IO_NORMAL/IO_SENDFILE/IO_SPLICE, and the out:/restart epilogue
 * are missing), left byte-identical; restore from upstream
 * lustre/llite/file.c.
 *
 * Purpose (from visible code): common read/write/splice engine — set up
 * a cl_io (ll_io_init, cl_io_rw_init), copy the per-subtype arguments
 * into the vvp/ccc io state, serialise writes against truncate via
 * lli_write_mutex / lli_trunc_sem, run cl_io_loop, advance *ppos by
 * io->ci_nob, tally read/write byte stats and maintain fd_write_failed.
 */
840 ll_file_io_generic(const struct lu_env
*env
, struct vvp_io_args
*args
,
841 struct file
*file
, enum cl_io_type iot
,
842 loff_t
*ppos
, size_t count
)
844 struct ll_inode_info
*lli
= ll_i2info(file
->f_dentry
->d_inode
);
845 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
850 io
= ccc_env_thread_io(env
);
851 ll_io_init(io
, file
, iot
== CIT_WRITE
);
853 if (cl_io_rw_init(env
, io
, iot
, *ppos
, count
) == 0) {
854 struct vvp_io
*vio
= vvp_env_io(env
);
855 struct ccc_io
*cio
= ccc_env_io(env
);
856 int write_mutex_locked
= 0;
858 cio
->cui_fd
= LUSTRE_FPRIVATE(file
);
859 vio
->cui_io_subtype
= args
->via_io_subtype
;
861 switch (vio
->cui_io_subtype
) {
863 cio
->cui_iov
= args
->u
.normal
.via_iov
;
864 cio
->cui_nrsegs
= args
->u
.normal
.via_nrsegs
;
865 cio
->cui_tot_nrsegs
= cio
->cui_nrsegs
;
866 cio
->cui_iocb
= args
->u
.normal
.via_iocb
;
867 if ((iot
== CIT_WRITE
) &&
868 !(cio
->cui_fd
->fd_flags
& LL_FILE_GROUP_LOCKED
)) {
869 if (mutex_lock_interruptible(&lli
->
871 GOTO(out
, result
= -ERESTARTSYS
);
872 write_mutex_locked
= 1;
873 } else if (iot
== CIT_READ
) {
874 down_read(&lli
->lli_trunc_sem
);
878 vio
->u
.sendfile
.cui_actor
= args
->u
.sendfile
.via_actor
;
879 vio
->u
.sendfile
.cui_target
= args
->u
.sendfile
.via_target
;
882 vio
->u
.splice
.cui_pipe
= args
->u
.splice
.via_pipe
;
883 vio
->u
.splice
.cui_flags
= args
->u
.splice
.via_flags
;
886 CERROR("Unknow IO type - %u\n", vio
->cui_io_subtype
);
889 result
= cl_io_loop(env
, io
);
890 if (write_mutex_locked
)
891 mutex_unlock(&lli
->lli_write_mutex
);
892 else if (args
->via_io_subtype
== IO_NORMAL
&& iot
== CIT_READ
)
893 up_read(&lli
->lli_trunc_sem
);
895 /* cl_io_rw_init() handled IO */
896 result
= io
->ci_result
;
899 if (io
->ci_nob
> 0) {
901 *ppos
= io
->u
.ci_wr
.wr
.crw_pos
;
906 /* If any bit been read/written (result != 0), we just return
907 * short read/write instead of restart io. */
908 if (result
== 0 && io
->ci_need_restart
) {
909 CDEBUG(D_VFSTRACE
, "Restart %s on %s from %lld, count:%zd\n",
910 iot
== CIT_READ
? "read" : "write",
911 file
->f_dentry
->d_name
.name
, *ppos
, count
);
912 LASSERTF(io
->ci_nob
== 0, "%zd", io
->ci_nob
);
916 if (iot
== CIT_READ
) {
918 ll_stats_ops_tally(ll_i2sbi(file
->f_dentry
->d_inode
),
919 LPROC_LL_READ_BYTES
, result
);
920 } else if (iot
== CIT_WRITE
) {
922 ll_stats_ops_tally(ll_i2sbi(file
->f_dentry
->d_inode
),
923 LPROC_LL_WRITE_BYTES
, result
);
924 fd
->fd_write_failed
= false;
925 } else if (result
!= -ERESTARTSYS
) {
926 fd
->fd_write_failed
= true;
935 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
937 static int ll_file_get_iov_count(const struct iovec
*iov
,
938 unsigned long *nr_segs
, size_t *count
)
943 for (seg
= 0; seg
< *nr_segs
; seg
++) {
944 const struct iovec
*iv
= &iov
[seg
];
947 * If any segment has a negative length, or the cumulative
948 * length ever wraps negative then return -EINVAL.
951 if (unlikely((ssize_t
)(cnt
|iv
->iov_len
) < 0))
953 if (access_ok(VERIFY_READ
, iv
->iov_base
, iv
->iov_len
))
958 cnt
-= iv
->iov_len
; /* This segment is no good */
965 static ssize_t
ll_file_aio_read(struct kiocb
*iocb
, const struct iovec
*iov
,
966 unsigned long nr_segs
, loff_t pos
)
969 struct vvp_io_args
*args
;
974 result
= ll_file_get_iov_count(iov
, &nr_segs
, &count
);
978 env
= cl_env_get(&refcheck
);
982 args
= vvp_env_args(env
, IO_NORMAL
);
983 args
->u
.normal
.via_iov
= (struct iovec
*)iov
;
984 args
->u
.normal
.via_nrsegs
= nr_segs
;
985 args
->u
.normal
.via_iocb
= iocb
;
987 result
= ll_file_io_generic(env
, args
, iocb
->ki_filp
, CIT_READ
,
988 &iocb
->ki_pos
, count
);
989 cl_env_put(env
, &refcheck
);
993 static ssize_t
ll_file_read(struct file
*file
, char *buf
, size_t count
,
997 struct iovec
*local_iov
;
1002 env
= cl_env_get(&refcheck
);
1004 return PTR_ERR(env
);
1006 local_iov
= &vvp_env_info(env
)->vti_local_iov
;
1007 kiocb
= &vvp_env_info(env
)->vti_kiocb
;
1008 local_iov
->iov_base
= (void __user
*)buf
;
1009 local_iov
->iov_len
= count
;
1010 init_sync_kiocb(kiocb
, file
);
1011 kiocb
->ki_pos
= *ppos
;
1012 kiocb
->ki_nbytes
= count
;
1014 result
= ll_file_aio_read(kiocb
, local_iov
, 1, kiocb
->ki_pos
);
1015 *ppos
= kiocb
->ki_pos
;
1017 cl_env_put(env
, &refcheck
);
1022 * Write to a file (through the page cache).
1024 static ssize_t
ll_file_aio_write(struct kiocb
*iocb
, const struct iovec
*iov
,
1025 unsigned long nr_segs
, loff_t pos
)
1028 struct vvp_io_args
*args
;
1033 result
= ll_file_get_iov_count(iov
, &nr_segs
, &count
);
1037 env
= cl_env_get(&refcheck
);
1039 return PTR_ERR(env
);
1041 args
= vvp_env_args(env
, IO_NORMAL
);
1042 args
->u
.normal
.via_iov
= (struct iovec
*)iov
;
1043 args
->u
.normal
.via_nrsegs
= nr_segs
;
1044 args
->u
.normal
.via_iocb
= iocb
;
1046 result
= ll_file_io_generic(env
, args
, iocb
->ki_filp
, CIT_WRITE
,
1047 &iocb
->ki_pos
, count
);
1048 cl_env_put(env
, &refcheck
);
1052 static ssize_t
ll_file_write(struct file
*file
, const char *buf
, size_t count
,
1056 struct iovec
*local_iov
;
1057 struct kiocb
*kiocb
;
1061 env
= cl_env_get(&refcheck
);
1063 return PTR_ERR(env
);
1065 local_iov
= &vvp_env_info(env
)->vti_local_iov
;
1066 kiocb
= &vvp_env_info(env
)->vti_kiocb
;
1067 local_iov
->iov_base
= (void __user
*)buf
;
1068 local_iov
->iov_len
= count
;
1069 init_sync_kiocb(kiocb
, file
);
1070 kiocb
->ki_pos
= *ppos
;
1071 kiocb
->ki_nbytes
= count
;
1073 result
= ll_file_aio_write(kiocb
, local_iov
, 1, kiocb
->ki_pos
);
1074 *ppos
= kiocb
->ki_pos
;
1076 cl_env_put(env
, &refcheck
);
1083 * Send file content (through pagecache) somewhere with helper
1085 static ssize_t
ll_file_splice_read(struct file
*in_file
, loff_t
*ppos
,
1086 struct pipe_inode_info
*pipe
, size_t count
,
1090 struct vvp_io_args
*args
;
1094 env
= cl_env_get(&refcheck
);
1096 return PTR_ERR(env
);
1098 args
= vvp_env_args(env
, IO_SPLICE
);
1099 args
->u
.splice
.via_pipe
= pipe
;
1100 args
->u
.splice
.via_flags
= flags
;
1102 result
= ll_file_io_generic(env
, args
, in_file
, CIT_READ
, ppos
, count
);
1103 cl_env_put(env
, &refcheck
);
/*
 * NOTE(review): extraction-damaged block (the ost_idx parameter line,
 * OBDO_ALLOC of @oa, the rc declaration and the out: epilogue are
 * missing), left byte-identical; restore from upstream
 * lustre/llite/file.c.
 *
 * Purpose (from visible code): recreate the OST objects of @inode —
 * clone the inode's lov_stripe_md, fill an obdo with
 * OBD_FL_RECREATE_OBJS plus the parent fid and inode attributes, and
 * re-run obd_create() under the inode size lock.
 */
1107 static int ll_lov_recreate(struct inode
*inode
, struct ost_id
*oi
,
1110 struct obd_export
*exp
= ll_i2dtexp(inode
);
1111 struct obd_trans_info oti
= { 0 };
1112 struct obdo
*oa
= NULL
;
1115 struct lov_stripe_md
*lsm
= NULL
, *lsm2
;
1121 lsm
= ccc_inode_lsm_get(inode
);
1122 if (!lsm_has_objects(lsm
))
1123 GOTO(out
, rc
= -ENOENT
);
1125 lsm_size
= sizeof(*lsm
) + (sizeof(struct lov_oinfo
) *
1126 (lsm
->lsm_stripe_count
));
1128 OBD_ALLOC_LARGE(lsm2
, lsm_size
);
1130 GOTO(out
, rc
= -ENOMEM
);
1133 oa
->o_nlink
= ost_idx
;
1134 oa
->o_flags
|= OBD_FL_RECREATE_OBJS
;
1135 oa
->o_valid
= OBD_MD_FLID
| OBD_MD_FLFLAGS
| OBD_MD_FLGROUP
;
1136 obdo_from_inode(oa
, inode
, OBD_MD_FLTYPE
| OBD_MD_FLATIME
|
1137 OBD_MD_FLMTIME
| OBD_MD_FLCTIME
);
1138 obdo_set_parent_fid(oa
, &ll_i2info(inode
)->lli_fid
);
1139 memcpy(lsm2
, lsm
, lsm_size
);
1140 ll_inode_size_lock(inode
);
1141 rc
= obd_create(NULL
, exp
, oa
, &lsm2
, &oti
);
1142 ll_inode_size_unlock(inode
);
1144 OBD_FREE_LARGE(lsm2
, lsm_size
);
1147 ccc_inode_lsm_put(inode
, lsm
);
1152 static int ll_lov_recreate_obj(struct inode
*inode
, unsigned long arg
)
1154 struct ll_recreate_obj ucreat
;
1157 if (!cfs_capable(CFS_CAP_SYS_ADMIN
))
1160 if (copy_from_user(&ucreat
, (struct ll_recreate_obj
*)arg
,
1164 ostid_set_seq_mdt0(&oi
);
1165 ostid_set_id(&oi
, ucreat
.lrc_id
);
1166 return ll_lov_recreate(inode
, &oi
, ucreat
.lrc_ost_idx
);
1169 static int ll_lov_recreate_fid(struct inode
*inode
, unsigned long arg
)
1175 if (!cfs_capable(CFS_CAP_SYS_ADMIN
))
1178 if (copy_from_user(&fid
, (struct lu_fid
*)arg
, sizeof(fid
)))
1181 fid_to_ostid(&fid
, &oi
);
1182 ost_idx
= (fid_seq(&fid
) >> 16) & 0xffff;
1183 return ll_lov_recreate(inode
, &oi
, ost_idx
);
/*
 * NOTE(review): extraction-damaged block (the already-striped early
 * return, rc declaration and GOTO branch structure are missing), left
 * byte-identical; restore from upstream lustre/llite/file.c.
 *
 * Purpose (from visible code): set striping information on @inode by
 * re-opening the file with an IT_OPEN intent carrying @lum/@lum_size
 * (ll_intent_file_open) under the inode size lock; bails out when a
 * stripe MD already exists, then releases the open handle, intent and
 * any reply request.
 */
1186 int ll_lov_setstripe_ea_info(struct inode
*inode
, struct file
*file
,
1187 int flags
, struct lov_user_md
*lum
, int lum_size
)
1189 struct lov_stripe_md
*lsm
= NULL
;
1190 struct lookup_intent oit
= {.it_op
= IT_OPEN
, .it_flags
= flags
};
1193 lsm
= ccc_inode_lsm_get(inode
);
1195 ccc_inode_lsm_put(inode
, lsm
);
1196 CDEBUG(D_IOCTL
, "stripe already exists for ino %lu\n",
1201 ll_inode_size_lock(inode
);
1202 rc
= ll_intent_file_open(file
, lum
, lum_size
, &oit
);
1205 rc
= oit
.d
.lustre
.it_status
;
1207 GOTO(out_req_free
, rc
);
1209 ll_release_openhandle(file
->f_dentry
, &oit
);
1212 ll_inode_size_unlock(inode
);
1213 ll_intent_release(&oit
);
1214 ccc_inode_lsm_put(inode
, lsm
);
1217 ptlrpc_req_finished((struct ptlrpc_request
*) oit
.d
.lustre
.it_data
);
1221 int ll_lov_getstripe_ea_info(struct inode
*inode
, const char *filename
,
1222 struct lov_mds_md
**lmmp
, int *lmm_size
,
1223 struct ptlrpc_request
**request
)
1225 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
1226 struct mdt_body
*body
;
1227 struct lov_mds_md
*lmm
= NULL
;
1228 struct ptlrpc_request
*req
= NULL
;
1229 struct md_op_data
*op_data
;
1232 rc
= ll_get_max_mdsize(sbi
, &lmmsize
);
1236 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, filename
,
1237 strlen(filename
), lmmsize
,
1238 LUSTRE_OPC_ANY
, NULL
);
1239 if (IS_ERR(op_data
))
1240 return PTR_ERR(op_data
);
1242 op_data
->op_valid
= OBD_MD_FLEASIZE
| OBD_MD_FLDIREA
;
1243 rc
= md_getattr_name(sbi
->ll_md_exp
, op_data
, &req
);
1244 ll_finish_md_op_data(op_data
);
1246 CDEBUG(D_INFO
, "md_getattr_name failed "
1247 "on %s: rc %d\n", filename
, rc
);
1251 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
1252 LASSERT(body
!= NULL
); /* checked by mdc_getattr_name */
1254 lmmsize
= body
->eadatasize
;
1256 if (!(body
->valid
& (OBD_MD_FLEASIZE
| OBD_MD_FLDIREA
)) ||
1258 GOTO(out
, rc
= -ENODATA
);
1261 lmm
= req_capsule_server_sized_get(&req
->rq_pill
, &RMF_MDT_MD
, lmmsize
);
1262 LASSERT(lmm
!= NULL
);
1264 if ((lmm
->lmm_magic
!= cpu_to_le32(LOV_MAGIC_V1
)) &&
1265 (lmm
->lmm_magic
!= cpu_to_le32(LOV_MAGIC_V3
))) {
1266 GOTO(out
, rc
= -EPROTO
);
1270 * This is coming from the MDS, so is probably in
1271 * little endian. We convert it to host endian before
1272 * passing it to userspace.
1274 if (LOV_MAGIC
!= cpu_to_le32(LOV_MAGIC
)) {
1277 stripe_count
= le16_to_cpu(lmm
->lmm_stripe_count
);
1278 if (le32_to_cpu(lmm
->lmm_pattern
) & LOV_PATTERN_F_RELEASED
)
1281 /* if function called for directory - we should
1282 * avoid swab not existent lsm objects */
1283 if (lmm
->lmm_magic
== cpu_to_le32(LOV_MAGIC_V1
)) {
1284 lustre_swab_lov_user_md_v1((struct lov_user_md_v1
*)lmm
);
1285 if (S_ISREG(body
->mode
))
1286 lustre_swab_lov_user_md_objects(
1287 ((struct lov_user_md_v1
*)lmm
)->lmm_objects
,
1289 } else if (lmm
->lmm_magic
== cpu_to_le32(LOV_MAGIC_V3
)) {
1290 lustre_swab_lov_user_md_v3((struct lov_user_md_v3
*)lmm
);
1291 if (S_ISREG(body
->mode
))
1292 lustre_swab_lov_user_md_objects(
1293 ((struct lov_user_md_v3
*)lmm
)->lmm_objects
,
1300 *lmm_size
= lmmsize
;
1305 static int ll_lov_setea(struct inode
*inode
, struct file
*file
,
1308 int flags
= MDS_OPEN_HAS_OBJS
| FMODE_WRITE
;
1309 struct lov_user_md
*lump
;
1310 int lum_size
= sizeof(struct lov_user_md
) +
1311 sizeof(struct lov_user_ost_data
);
1314 if (!cfs_capable(CFS_CAP_SYS_ADMIN
))
1317 OBD_ALLOC_LARGE(lump
, lum_size
);
1321 if (copy_from_user(lump
, (struct lov_user_md
*)arg
, lum_size
)) {
1322 OBD_FREE_LARGE(lump
, lum_size
);
1326 rc
= ll_lov_setstripe_ea_info(inode
, file
, flags
, lump
, lum_size
);
1328 OBD_FREE_LARGE(lump
, lum_size
);
1332 static int ll_lov_setstripe(struct inode
*inode
, struct file
*file
,
1335 struct lov_user_md_v3 lumv3
;
1336 struct lov_user_md_v1
*lumv1
= (struct lov_user_md_v1
*)&lumv3
;
1337 struct lov_user_md_v1
*lumv1p
= (struct lov_user_md_v1
*)arg
;
1338 struct lov_user_md_v3
*lumv3p
= (struct lov_user_md_v3
*)arg
;
1340 int flags
= FMODE_WRITE
;
1342 /* first try with v1 which is smaller than v3 */
1343 lum_size
= sizeof(struct lov_user_md_v1
);
1344 if (copy_from_user(lumv1
, lumv1p
, lum_size
))
1347 if (lumv1
->lmm_magic
== LOV_USER_MAGIC_V3
) {
1348 lum_size
= sizeof(struct lov_user_md_v3
);
1349 if (copy_from_user(&lumv3
, lumv3p
, lum_size
))
1353 rc
= ll_lov_setstripe_ea_info(inode
, file
, flags
, lumv1
, lum_size
);
1355 struct lov_stripe_md
*lsm
;
1358 put_user(0, &lumv1p
->lmm_stripe_count
);
1360 ll_layout_refresh(inode
, &gen
);
1361 lsm
= ccc_inode_lsm_get(inode
);
1362 rc
= obd_iocontrol(LL_IOC_LOV_GETSTRIPE
, ll_i2dtexp(inode
),
1363 0, lsm
, (void *)arg
);
1364 ccc_inode_lsm_put(inode
, lsm
);
1369 static int ll_lov_getstripe(struct inode
*inode
, unsigned long arg
)
1371 struct lov_stripe_md
*lsm
;
1374 lsm
= ccc_inode_lsm_get(inode
);
1376 rc
= obd_iocontrol(LL_IOC_LOV_GETSTRIPE
, ll_i2dtexp(inode
), 0,
1378 ccc_inode_lsm_put(inode
, lsm
);
1382 int ll_get_grouplock(struct inode
*inode
, struct file
*file
, unsigned long arg
)
1384 struct ll_inode_info
*lli
= ll_i2info(inode
);
1385 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
1386 struct ccc_grouplock grouplock
;
1389 if (ll_file_nolock(file
))
1392 spin_lock(&lli
->lli_lock
);
1393 if (fd
->fd_flags
& LL_FILE_GROUP_LOCKED
) {
1394 CWARN("group lock already existed with gid %lu\n",
1395 fd
->fd_grouplock
.cg_gid
);
1396 spin_unlock(&lli
->lli_lock
);
1399 LASSERT(fd
->fd_grouplock
.cg_lock
== NULL
);
1400 spin_unlock(&lli
->lli_lock
);
1402 rc
= cl_get_grouplock(cl_i2info(inode
)->lli_clob
,
1403 arg
, (file
->f_flags
& O_NONBLOCK
), &grouplock
);
1407 spin_lock(&lli
->lli_lock
);
1408 if (fd
->fd_flags
& LL_FILE_GROUP_LOCKED
) {
1409 spin_unlock(&lli
->lli_lock
);
1410 CERROR("another thread just won the race\n");
1411 cl_put_grouplock(&grouplock
);
1415 fd
->fd_flags
|= LL_FILE_GROUP_LOCKED
;
1416 fd
->fd_grouplock
= grouplock
;
1417 spin_unlock(&lli
->lli_lock
);
1419 CDEBUG(D_INFO
, "group lock %lu obtained\n", arg
);
1423 int ll_put_grouplock(struct inode
*inode
, struct file
*file
, unsigned long arg
)
1425 struct ll_inode_info
*lli
= ll_i2info(inode
);
1426 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
1427 struct ccc_grouplock grouplock
;
1429 spin_lock(&lli
->lli_lock
);
1430 if (!(fd
->fd_flags
& LL_FILE_GROUP_LOCKED
)) {
1431 spin_unlock(&lli
->lli_lock
);
1432 CWARN("no group lock held\n");
1435 LASSERT(fd
->fd_grouplock
.cg_lock
!= NULL
);
1437 if (fd
->fd_grouplock
.cg_gid
!= arg
) {
1438 CWARN("group lock %lu doesn't match current id %lu\n",
1439 arg
, fd
->fd_grouplock
.cg_gid
);
1440 spin_unlock(&lli
->lli_lock
);
1444 grouplock
= fd
->fd_grouplock
;
1445 memset(&fd
->fd_grouplock
, 0, sizeof(fd
->fd_grouplock
));
1446 fd
->fd_flags
&= ~LL_FILE_GROUP_LOCKED
;
1447 spin_unlock(&lli
->lli_lock
);
1449 cl_put_grouplock(&grouplock
);
1450 CDEBUG(D_INFO
, "group lock %lu released\n", arg
);
1455 * Close inode open handle
1457 * \param dentry [in] dentry which contains the inode
1458 * \param it [in,out] intent which contains open info and result
1461 * \retval <0 failure
1463 int ll_release_openhandle(struct dentry
*dentry
, struct lookup_intent
*it
)
1465 struct inode
*inode
= dentry
->d_inode
;
1466 struct obd_client_handle
*och
;
1471 /* Root ? Do nothing. */
1472 if (dentry
->d_inode
->i_sb
->s_root
== dentry
)
1475 /* No open handle to close? Move away */
1476 if (!it_disposition(it
, DISP_OPEN_OPEN
))
1479 LASSERT(it_open_error(DISP_OPEN_OPEN
, it
) == 0);
1481 OBD_ALLOC(och
, sizeof(*och
));
1483 GOTO(out
, rc
= -ENOMEM
);
1485 ll_och_fill(ll_i2sbi(inode
)->ll_md_exp
,
1486 ll_i2info(inode
), it
, och
);
1488 rc
= ll_close_inode_openhandle(ll_i2sbi(inode
)->ll_md_exp
,
1491 /* this one is in place of ll_file_open */
1492 if (it_disposition(it
, DISP_ENQ_OPEN_REF
)) {
1493 ptlrpc_req_finished(it
->d
.lustre
.it_data
);
1494 it_clear_disposition(it
, DISP_ENQ_OPEN_REF
);
1500 * Get size for inode for which FIEMAP mapping is requested.
1501 * Make the FIEMAP get_info call and returns the result.
1503 int ll_do_fiemap(struct inode
*inode
, struct ll_user_fiemap
*fiemap
,
1506 struct obd_export
*exp
= ll_i2dtexp(inode
);
1507 struct lov_stripe_md
*lsm
= NULL
;
1508 struct ll_fiemap_info_key fm_key
= { .name
= KEY_FIEMAP
, };
1509 int vallen
= num_bytes
;
1512 /* Checks for fiemap flags */
1513 if (fiemap
->fm_flags
& ~LUSTRE_FIEMAP_FLAGS_COMPAT
) {
1514 fiemap
->fm_flags
&= ~LUSTRE_FIEMAP_FLAGS_COMPAT
;
1518 /* Check for FIEMAP_FLAG_SYNC */
1519 if (fiemap
->fm_flags
& FIEMAP_FLAG_SYNC
) {
1520 rc
= filemap_fdatawrite(inode
->i_mapping
);
1525 lsm
= ccc_inode_lsm_get(inode
);
1529 /* If the stripe_count > 1 and the application does not understand
1530 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1532 if (lsm
->lsm_stripe_count
> 1 &&
1533 !(fiemap
->fm_flags
& FIEMAP_FLAG_DEVICE_ORDER
))
1534 GOTO(out
, rc
= -EOPNOTSUPP
);
1536 fm_key
.oa
.o_oi
= lsm
->lsm_oi
;
1537 fm_key
.oa
.o_valid
= OBD_MD_FLID
| OBD_MD_FLGROUP
;
1539 obdo_from_inode(&fm_key
.oa
, inode
, OBD_MD_FLSIZE
);
1540 obdo_set_parent_fid(&fm_key
.oa
, &ll_i2info(inode
)->lli_fid
);
1541 /* If filesize is 0, then there would be no objects for mapping */
1542 if (fm_key
.oa
.o_size
== 0) {
1543 fiemap
->fm_mapped_extents
= 0;
1547 memcpy(&fm_key
.fiemap
, fiemap
, sizeof(*fiemap
));
1549 rc
= obd_get_info(NULL
, exp
, sizeof(fm_key
), &fm_key
, &vallen
,
1552 CERROR("obd_get_info failed: rc = %d\n", rc
);
1555 ccc_inode_lsm_put(inode
, lsm
);
1559 int ll_fid2path(struct inode
*inode
, void *arg
)
1561 struct obd_export
*exp
= ll_i2mdexp(inode
);
1562 struct getinfo_fid2path
*gfout
, *gfin
;
1565 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH
) &&
1566 !(ll_i2sbi(inode
)->ll_flags
& LL_SBI_USER_FID2PATH
))
1569 /* Need to get the buflen */
1570 OBD_ALLOC_PTR(gfin
);
1573 if (copy_from_user(gfin
, arg
, sizeof(*gfin
))) {
1578 outsize
= sizeof(*gfout
) + gfin
->gf_pathlen
;
1579 OBD_ALLOC(gfout
, outsize
);
1580 if (gfout
== NULL
) {
1584 memcpy(gfout
, gfin
, sizeof(*gfout
));
1587 /* Call mdc_iocontrol */
1588 rc
= obd_iocontrol(OBD_IOC_FID2PATH
, exp
, outsize
, gfout
, NULL
);
1592 if (copy_to_user(arg
, gfout
, outsize
))
1596 OBD_FREE(gfout
, outsize
);
1600 static int ll_ioctl_fiemap(struct inode
*inode
, unsigned long arg
)
1602 struct ll_user_fiemap
*fiemap_s
;
1603 size_t num_bytes
, ret_bytes
;
1604 unsigned int extent_count
;
1607 /* Get the extent count so we can calculate the size of
1608 * required fiemap buffer */
1609 if (get_user(extent_count
,
1610 &((struct ll_user_fiemap __user
*)arg
)->fm_extent_count
))
1612 num_bytes
= sizeof(*fiemap_s
) + (extent_count
*
1613 sizeof(struct ll_fiemap_extent
));
1615 OBD_ALLOC_LARGE(fiemap_s
, num_bytes
);
1616 if (fiemap_s
== NULL
)
1619 /* get the fiemap value */
1620 if (copy_from_user(fiemap_s
, (struct ll_user_fiemap __user
*)arg
,
1622 GOTO(error
, rc
= -EFAULT
);
1624 /* If fm_extent_count is non-zero, read the first extent since
1625 * it is used to calculate end_offset and device from previous
1628 if (copy_from_user(&fiemap_s
->fm_extents
[0],
1629 (char __user
*)arg
+ sizeof(*fiemap_s
),
1630 sizeof(struct ll_fiemap_extent
)))
1631 GOTO(error
, rc
= -EFAULT
);
1634 rc
= ll_do_fiemap(inode
, fiemap_s
, num_bytes
);
1638 ret_bytes
= sizeof(struct ll_user_fiemap
);
1640 if (extent_count
!= 0)
1641 ret_bytes
+= (fiemap_s
->fm_mapped_extents
*
1642 sizeof(struct ll_fiemap_extent
));
1644 if (copy_to_user((void *)arg
, fiemap_s
, ret_bytes
))
1648 OBD_FREE_LARGE(fiemap_s
, num_bytes
);
1653 * Read the data_version for inode.
1655 * This value is computed using stripe object version on OST.
1656 * Version is computed using server side locking.
1658 * @param extent_lock Take extent lock. Not needed if a process is already
1659 * holding the OST object group locks.
1661 int ll_data_version(struct inode
*inode
, __u64
*data_version
,
1664 struct lov_stripe_md
*lsm
= NULL
;
1665 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
1666 struct obdo
*obdo
= NULL
;
1669 /* If no stripe, we consider version is 0. */
1670 lsm
= ccc_inode_lsm_get(inode
);
1671 if (!lsm_has_objects(lsm
)) {
1673 CDEBUG(D_INODE
, "No object for inode\n");
1677 OBD_ALLOC_PTR(obdo
);
1679 GOTO(out
, rc
= -ENOMEM
);
1681 rc
= ll_lsm_getattr(lsm
, sbi
->ll_dt_exp
, NULL
, obdo
, 0, extent_lock
);
1683 if (!(obdo
->o_valid
& OBD_MD_FLDATAVERSION
))
1686 *data_version
= obdo
->o_data_version
;
1691 ccc_inode_lsm_put(inode
, lsm
);
1695 struct ll_swap_stack
{
1696 struct iattr ia1
, ia2
;
1698 struct inode
*inode1
, *inode2
;
1699 bool check_dv1
, check_dv2
;
1702 static int ll_swap_layouts(struct file
*file1
, struct file
*file2
,
1703 struct lustre_swap_layouts
*lsl
)
1705 struct mdc_swap_layouts msl
;
1706 struct md_op_data
*op_data
;
1709 struct ll_swap_stack
*llss
= NULL
;
1712 OBD_ALLOC_PTR(llss
);
1716 llss
->inode1
= file1
->f_dentry
->d_inode
;
1717 llss
->inode2
= file2
->f_dentry
->d_inode
;
1719 if (!S_ISREG(llss
->inode2
->i_mode
))
1720 GOTO(free
, rc
= -EINVAL
);
1722 if (inode_permission(llss
->inode1
, MAY_WRITE
) ||
1723 inode_permission(llss
->inode2
, MAY_WRITE
))
1724 GOTO(free
, rc
= -EPERM
);
1726 if (llss
->inode2
->i_sb
!= llss
->inode1
->i_sb
)
1727 GOTO(free
, rc
= -EXDEV
);
1729 /* we use 2 bool because it is easier to swap than 2 bits */
1730 if (lsl
->sl_flags
& SWAP_LAYOUTS_CHECK_DV1
)
1731 llss
->check_dv1
= true;
1733 if (lsl
->sl_flags
& SWAP_LAYOUTS_CHECK_DV2
)
1734 llss
->check_dv2
= true;
1736 /* we cannot use lsl->sl_dvX directly because we may swap them */
1737 llss
->dv1
= lsl
->sl_dv1
;
1738 llss
->dv2
= lsl
->sl_dv2
;
1740 rc
= lu_fid_cmp(ll_inode2fid(llss
->inode1
), ll_inode2fid(llss
->inode2
));
1741 if (rc
== 0) /* same file, done! */
1744 if (rc
< 0) { /* sequentialize it */
1745 swap(llss
->inode1
, llss
->inode2
);
1747 swap(llss
->dv1
, llss
->dv2
);
1748 swap(llss
->check_dv1
, llss
->check_dv2
);
1752 if (gid
!= 0) { /* application asks to flush dirty cache */
1753 rc
= ll_get_grouplock(llss
->inode1
, file1
, gid
);
1757 rc
= ll_get_grouplock(llss
->inode2
, file2
, gid
);
1759 ll_put_grouplock(llss
->inode1
, file1
, gid
);
1764 /* to be able to restore mtime and atime after swap
1765 * we need to first save them */
1767 (SWAP_LAYOUTS_KEEP_MTIME
| SWAP_LAYOUTS_KEEP_ATIME
)) {
1768 llss
->ia1
.ia_mtime
= llss
->inode1
->i_mtime
;
1769 llss
->ia1
.ia_atime
= llss
->inode1
->i_atime
;
1770 llss
->ia1
.ia_valid
= ATTR_MTIME
| ATTR_ATIME
;
1771 llss
->ia2
.ia_mtime
= llss
->inode2
->i_mtime
;
1772 llss
->ia2
.ia_atime
= llss
->inode2
->i_atime
;
1773 llss
->ia2
.ia_valid
= ATTR_MTIME
| ATTR_ATIME
;
1776 /* ultimate check, before swaping the layouts we check if
1777 * dataversion has changed (if requested) */
1778 if (llss
->check_dv1
) {
1779 rc
= ll_data_version(llss
->inode1
, &dv
, 0);
1782 if (dv
!= llss
->dv1
)
1783 GOTO(putgl
, rc
= -EAGAIN
);
1786 if (llss
->check_dv2
) {
1787 rc
= ll_data_version(llss
->inode2
, &dv
, 0);
1790 if (dv
!= llss
->dv2
)
1791 GOTO(putgl
, rc
= -EAGAIN
);
1794 /* struct md_op_data is used to send the swap args to the mdt
1795 * only flags is missing, so we use struct mdc_swap_layouts
1796 * through the md_op_data->op_data */
1797 /* flags from user space have to be converted before they are send to
1798 * server, no flag is sent today, they are only used on the client */
1801 op_data
= ll_prep_md_op_data(NULL
, llss
->inode1
, llss
->inode2
, NULL
, 0,
1802 0, LUSTRE_OPC_ANY
, &msl
);
1803 if (IS_ERR(op_data
))
1804 GOTO(free
, rc
= PTR_ERR(op_data
));
1806 rc
= obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS
, ll_i2mdexp(llss
->inode1
),
1807 sizeof(*op_data
), op_data
, NULL
);
1808 ll_finish_md_op_data(op_data
);
1812 ll_put_grouplock(llss
->inode2
, file2
, gid
);
1813 ll_put_grouplock(llss
->inode1
, file1
, gid
);
1816 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1820 /* clear useless flags */
1821 if (!(lsl
->sl_flags
& SWAP_LAYOUTS_KEEP_MTIME
)) {
1822 llss
->ia1
.ia_valid
&= ~ATTR_MTIME
;
1823 llss
->ia2
.ia_valid
&= ~ATTR_MTIME
;
1826 if (!(lsl
->sl_flags
& SWAP_LAYOUTS_KEEP_ATIME
)) {
1827 llss
->ia1
.ia_valid
&= ~ATTR_ATIME
;
1828 llss
->ia2
.ia_valid
&= ~ATTR_ATIME
;
1831 /* update time if requested */
1833 if (llss
->ia2
.ia_valid
!= 0) {
1834 mutex_lock(&llss
->inode1
->i_mutex
);
1835 rc
= ll_setattr(file1
->f_dentry
, &llss
->ia2
);
1836 mutex_unlock(&llss
->inode1
->i_mutex
);
1839 if (llss
->ia1
.ia_valid
!= 0) {
1842 mutex_lock(&llss
->inode2
->i_mutex
);
1843 rc1
= ll_setattr(file2
->f_dentry
, &llss
->ia1
);
1844 mutex_unlock(&llss
->inode2
->i_mutex
);
1856 long ll_file_ioctl(struct file
*file
, unsigned int cmd
, unsigned long arg
)
1858 struct inode
*inode
= file
->f_dentry
->d_inode
;
1859 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
1862 CDEBUG(D_VFSTRACE
, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode
->i_ino
,
1863 inode
->i_generation
, inode
, cmd
);
1864 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_IOCTL
, 1);
1866 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1867 if (_IOC_TYPE(cmd
) == 'T' || _IOC_TYPE(cmd
) == 't') /* tty ioctls */
1871 case LL_IOC_GETFLAGS
:
1872 /* Get the current value of the file flags */
1873 return put_user(fd
->fd_flags
, (int *)arg
);
1874 case LL_IOC_SETFLAGS
:
1875 case LL_IOC_CLRFLAGS
:
1876 /* Set or clear specific file flags */
1877 /* XXX This probably needs checks to ensure the flags are
1878 * not abused, and to handle any flag side effects.
1880 if (get_user(flags
, (int *) arg
))
1883 if (cmd
== LL_IOC_SETFLAGS
) {
1884 if ((flags
& LL_FILE_IGNORE_LOCK
) &&
1885 !(file
->f_flags
& O_DIRECT
)) {
1886 CERROR("%s: unable to disable locking on "
1887 "non-O_DIRECT file\n", current
->comm
);
1891 fd
->fd_flags
|= flags
;
1893 fd
->fd_flags
&= ~flags
;
1896 case LL_IOC_LOV_SETSTRIPE
:
1897 return ll_lov_setstripe(inode
, file
, arg
);
1898 case LL_IOC_LOV_SETEA
:
1899 return ll_lov_setea(inode
, file
, arg
);
1900 case LL_IOC_LOV_SWAP_LAYOUTS
: {
1902 struct lustre_swap_layouts lsl
;
1904 if (copy_from_user(&lsl
, (char *)arg
,
1905 sizeof(struct lustre_swap_layouts
)))
1908 if ((file
->f_flags
& O_ACCMODE
) == 0) /* O_RDONLY */
1911 file2
= fget(lsl
.sl_fd
);
1916 if ((file2
->f_flags
& O_ACCMODE
) != 0) /* O_WRONLY or O_RDWR */
1917 rc
= ll_swap_layouts(file
, file2
, &lsl
);
1921 case LL_IOC_LOV_GETSTRIPE
:
1922 return ll_lov_getstripe(inode
, arg
);
1923 case LL_IOC_RECREATE_OBJ
:
1924 return ll_lov_recreate_obj(inode
, arg
);
1925 case LL_IOC_RECREATE_FID
:
1926 return ll_lov_recreate_fid(inode
, arg
);
1927 case FSFILT_IOC_FIEMAP
:
1928 return ll_ioctl_fiemap(inode
, arg
);
1929 case FSFILT_IOC_GETFLAGS
:
1930 case FSFILT_IOC_SETFLAGS
:
1931 return ll_iocontrol(inode
, file
, cmd
, arg
);
1932 case FSFILT_IOC_GETVERSION_OLD
:
1933 case FSFILT_IOC_GETVERSION
:
1934 return put_user(inode
->i_generation
, (int *)arg
);
1935 case LL_IOC_GROUP_LOCK
:
1936 return ll_get_grouplock(inode
, file
, arg
);
1937 case LL_IOC_GROUP_UNLOCK
:
1938 return ll_put_grouplock(inode
, file
, arg
);
1939 case IOC_OBD_STATFS
:
1940 return ll_obd_statfs(inode
, (void *)arg
);
1942 /* We need to special case any other ioctls we want to handle,
1943 * to send them to the MDS/OST as appropriate and to properly
1944 * network encode the arg field.
1945 case FSFILT_IOC_SETVERSION_OLD:
1946 case FSFILT_IOC_SETVERSION:
1948 case LL_IOC_FLUSHCTX
:
1949 return ll_flush_ctx(inode
);
1950 case LL_IOC_PATH2FID
: {
1951 if (copy_to_user((void *)arg
, ll_inode2fid(inode
),
1952 sizeof(struct lu_fid
)))
1957 case OBD_IOC_FID2PATH
:
1958 return ll_fid2path(inode
, (void *)arg
);
1959 case LL_IOC_DATA_VERSION
: {
1960 struct ioc_data_version idv
;
1963 if (copy_from_user(&idv
, (char *)arg
, sizeof(idv
)))
1966 rc
= ll_data_version(inode
, &idv
.idv_version
,
1967 !(idv
.idv_flags
& LL_DV_NOFLUSH
));
1969 if (rc
== 0 && copy_to_user((char *) arg
, &idv
, sizeof(idv
)))
1975 case LL_IOC_GET_MDTIDX
: {
1978 mdtidx
= ll_get_mdt_idx(inode
);
1982 if (put_user((int)mdtidx
, (int*)arg
))
1987 case OBD_IOC_GETDTNAME
:
1988 case OBD_IOC_GETMDNAME
:
1989 return ll_get_obd_name(inode
, cmd
, arg
);
1990 case LL_IOC_HSM_STATE_GET
: {
1991 struct md_op_data
*op_data
;
1992 struct hsm_user_state
*hus
;
1999 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
2000 LUSTRE_OPC_ANY
, hus
);
2001 if (IS_ERR(op_data
)) {
2003 return PTR_ERR(op_data
);
2006 rc
= obd_iocontrol(cmd
, ll_i2mdexp(inode
), sizeof(*op_data
),
2009 if (copy_to_user((void *)arg
, hus
, sizeof(*hus
)))
2012 ll_finish_md_op_data(op_data
);
2016 case LL_IOC_HSM_STATE_SET
: {
2017 struct md_op_data
*op_data
;
2018 struct hsm_state_set
*hss
;
2024 if (copy_from_user(hss
, (char *)arg
, sizeof(*hss
))) {
2029 /* Non-root users are forbidden to set or clear flags which are
2030 * NOT defined in HSM_USER_MASK. */
2031 if (((hss
->hss_setmask
| hss
->hss_clearmask
) & ~HSM_USER_MASK
)
2032 && !cfs_capable(CFS_CAP_SYS_ADMIN
)) {
2037 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
2038 LUSTRE_OPC_ANY
, hss
);
2039 if (IS_ERR(op_data
)) {
2041 return PTR_ERR(op_data
);
2044 rc
= obd_iocontrol(cmd
, ll_i2mdexp(inode
), sizeof(*op_data
),
2047 ll_finish_md_op_data(op_data
);
2052 case LL_IOC_HSM_ACTION
: {
2053 struct md_op_data
*op_data
;
2054 struct hsm_current_action
*hca
;
2061 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
2062 LUSTRE_OPC_ANY
, hca
);
2063 if (IS_ERR(op_data
)) {
2065 return PTR_ERR(op_data
);
2068 rc
= obd_iocontrol(cmd
, ll_i2mdexp(inode
), sizeof(*op_data
),
2071 if (copy_to_user((char *)arg
, hca
, sizeof(*hca
)))
2074 ll_finish_md_op_data(op_data
);
2082 ll_iocontrol_call(inode
, file
, cmd
, arg
, &err
))
2085 return obd_iocontrol(cmd
, ll_i2dtexp(inode
), 0, NULL
,
2092 loff_t
ll_file_seek(struct file
*file
, loff_t offset
, int origin
)
2094 struct inode
*inode
= file
->f_dentry
->d_inode
;
2095 loff_t retval
, eof
= 0;
2097 retval
= offset
+ ((origin
== SEEK_END
) ? i_size_read(inode
) :
2098 (origin
== SEEK_CUR
) ? file
->f_pos
: 0);
2099 CDEBUG(D_VFSTRACE
, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2100 inode
->i_ino
, inode
->i_generation
, inode
, retval
, retval
,
2102 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_LLSEEK
, 1);
2104 if (origin
== SEEK_END
|| origin
== SEEK_HOLE
|| origin
== SEEK_DATA
) {
2105 retval
= ll_glimpse_size(inode
);
2108 eof
= i_size_read(inode
);
2111 retval
= generic_file_llseek_size(file
, offset
, origin
,
2112 ll_file_maxbytes(inode
), eof
);
2116 int ll_flush(struct file
*file
, fl_owner_t id
)
2118 struct inode
*inode
= file
->f_dentry
->d_inode
;
2119 struct ll_inode_info
*lli
= ll_i2info(inode
);
2120 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
2123 LASSERT(!S_ISDIR(inode
->i_mode
));
2125 /* catch async errors that were recorded back when async writeback
2126 * failed for pages in this mapping. */
2127 rc
= lli
->lli_async_rc
;
2128 lli
->lli_async_rc
= 0;
2129 err
= lov_read_and_clear_async_rc(lli
->lli_clob
);
2133 /* The application has been told write failure already.
2134 * Do not report failure again. */
2135 if (fd
->fd_write_failed
)
2137 return rc
? -EIO
: 0;
2141 * Called to make sure a portion of file has been written out.
2142 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2144 * Return how many pages have been written.
2146 int cl_sync_file_range(struct inode
*inode
, loff_t start
, loff_t end
,
2147 enum cl_fsync_mode mode
, int ignore_layout
)
2149 struct cl_env_nest nest
;
2152 struct obd_capa
*capa
= NULL
;
2153 struct cl_fsync_io
*fio
;
2156 if (mode
!= CL_FSYNC_NONE
&& mode
!= CL_FSYNC_LOCAL
&&
2157 mode
!= CL_FSYNC_DISCARD
&& mode
!= CL_FSYNC_ALL
)
2160 env
= cl_env_nested_get(&nest
);
2162 return PTR_ERR(env
);
2164 capa
= ll_osscapa_get(inode
, CAPA_OPC_OSS_WRITE
);
2166 io
= ccc_env_thread_io(env
);
2167 io
->ci_obj
= cl_i2info(inode
)->lli_clob
;
2168 io
->ci_ignore_layout
= ignore_layout
;
2170 /* initialize parameters for sync */
2171 fio
= &io
->u
.ci_fsync
;
2172 fio
->fi_capa
= capa
;
2173 fio
->fi_start
= start
;
2175 fio
->fi_fid
= ll_inode2fid(inode
);
2176 fio
->fi_mode
= mode
;
2177 fio
->fi_nr_written
= 0;
2179 if (cl_io_init(env
, io
, CIT_FSYNC
, io
->ci_obj
) == 0)
2180 result
= cl_io_loop(env
, io
);
2182 result
= io
->ci_result
;
2184 result
= fio
->fi_nr_written
;
2185 cl_io_fini(env
, io
);
2186 cl_env_nested_put(&nest
, env
);
2194 * When dentry is provided (the 'else' case), *file->f_dentry may be
2195 * null and dentry must be used directly rather than pulled from
2196 * *file->f_dentry as is done otherwise.
2199 int ll_fsync(struct file
*file
, loff_t start
, loff_t end
, int datasync
)
2201 struct dentry
*dentry
= file
->f_dentry
;
2202 struct inode
*inode
= dentry
->d_inode
;
2203 struct ll_inode_info
*lli
= ll_i2info(inode
);
2204 struct ptlrpc_request
*req
;
2205 struct obd_capa
*oc
;
2208 CDEBUG(D_VFSTRACE
, "VFS Op:inode=%lu/%u(%p)\n", inode
->i_ino
,
2209 inode
->i_generation
, inode
);
2210 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_FSYNC
, 1);
2212 rc
= filemap_write_and_wait_range(inode
->i_mapping
, start
, end
);
2213 mutex_lock(&inode
->i_mutex
);
2215 /* catch async errors that were recorded back when async writeback
2216 * failed for pages in this mapping. */
2217 if (!S_ISDIR(inode
->i_mode
)) {
2218 err
= lli
->lli_async_rc
;
2219 lli
->lli_async_rc
= 0;
2222 err
= lov_read_and_clear_async_rc(lli
->lli_clob
);
2227 oc
= ll_mdscapa_get(inode
);
2228 err
= md_sync(ll_i2sbi(inode
)->ll_md_exp
, ll_inode2fid(inode
), oc
,
2234 ptlrpc_req_finished(req
);
2236 if (datasync
&& S_ISREG(inode
->i_mode
)) {
2237 struct ll_file_data
*fd
= LUSTRE_FPRIVATE(file
);
2239 err
= cl_sync_file_range(inode
, 0, OBD_OBJECT_EOF
,
2241 if (rc
== 0 && err
< 0)
2244 fd
->fd_write_failed
= true;
2246 fd
->fd_write_failed
= false;
2249 mutex_unlock(&inode
->i_mutex
);
2253 int ll_file_flock(struct file
*file
, int cmd
, struct file_lock
*file_lock
)
2255 struct inode
*inode
= file
->f_dentry
->d_inode
;
2256 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
2257 struct ldlm_enqueue_info einfo
= {
2258 .ei_type
= LDLM_FLOCK
,
2259 .ei_cb_cp
= ldlm_flock_completion_ast
,
2260 .ei_cbdata
= file_lock
,
2262 struct md_op_data
*op_data
;
2263 struct lustre_handle lockh
= {0};
2264 ldlm_policy_data_t flock
= {{0}};
2269 CDEBUG(D_VFSTRACE
, "VFS Op:inode=%lu file_lock=%p\n",
2270 inode
->i_ino
, file_lock
);
2272 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_FLOCK
, 1);
2274 if (file_lock
->fl_flags
& FL_FLOCK
) {
2275 LASSERT((cmd
== F_SETLKW
) || (cmd
== F_SETLK
));
2276 /* flocks are whole-file locks */
2277 flock
.l_flock
.end
= OFFSET_MAX
;
2278 /* For flocks owner is determined by the local file desctiptor*/
2279 flock
.l_flock
.owner
= (unsigned long)file_lock
->fl_file
;
2280 } else if (file_lock
->fl_flags
& FL_POSIX
) {
2281 flock
.l_flock
.owner
= (unsigned long)file_lock
->fl_owner
;
2282 flock
.l_flock
.start
= file_lock
->fl_start
;
2283 flock
.l_flock
.end
= file_lock
->fl_end
;
2287 flock
.l_flock
.pid
= file_lock
->fl_pid
;
2289 /* Somewhat ugly workaround for svc lockd.
2290 * lockd installs custom fl_lmops->lm_compare_owner that checks
2291 * for the fl_owner to be the same (which it always is on local node
2292 * I guess between lockd processes) and then compares pid.
2293 * As such we assign pid to the owner field to make it all work,
2294 * conflict with normal locks is unlikely since pid space and
2295 * pointer space for current->files are not intersecting */
2296 if (file_lock
->fl_lmops
&& file_lock
->fl_lmops
->lm_compare_owner
)
2297 flock
.l_flock
.owner
= (unsigned long)file_lock
->fl_pid
;
2299 switch (file_lock
->fl_type
) {
2301 einfo
.ei_mode
= LCK_PR
;
2304 /* An unlock request may or may not have any relation to
2305 * existing locks so we may not be able to pass a lock handle
2306 * via a normal ldlm_lock_cancel() request. The request may even
2307 * unlock a byte range in the middle of an existing lock. In
2308 * order to process an unlock request we need all of the same
2309 * information that is given with a normal read or write record
2310 * lock request. To avoid creating another ldlm unlock (cancel)
2311 * message we'll treat a LCK_NL flock request as an unlock. */
2312 einfo
.ei_mode
= LCK_NL
;
2315 einfo
.ei_mode
= LCK_PW
;
2318 CDEBUG(D_INFO
, "Unknown fcntl lock type: %d\n",
2319 file_lock
->fl_type
);
2334 flags
= LDLM_FL_BLOCK_NOWAIT
;
2340 flags
= LDLM_FL_TEST_LOCK
;
2341 /* Save the old mode so that if the mode in the lock changes we
2342 * can decrement the appropriate reader or writer refcount. */
2343 file_lock
->fl_type
= einfo
.ei_mode
;
2346 CERROR("unknown fcntl lock command: %d\n", cmd
);
2350 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
2351 LUSTRE_OPC_ANY
, NULL
);
2352 if (IS_ERR(op_data
))
2353 return PTR_ERR(op_data
);
2355 CDEBUG(D_DLMTRACE
, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2356 "start="LPU64
", end="LPU64
"\n", inode
->i_ino
, flock
.l_flock
.pid
,
2357 flags
, einfo
.ei_mode
, flock
.l_flock
.start
, flock
.l_flock
.end
);
2359 rc
= md_enqueue(sbi
->ll_md_exp
, &einfo
, NULL
,
2360 op_data
, &lockh
, &flock
, 0, NULL
/* req */, flags
);
2362 if ((file_lock
->fl_flags
& FL_FLOCK
) &&
2363 (rc
== 0 || file_lock
->fl_type
== F_UNLCK
))
2364 rc2
= flock_lock_file_wait(file
, file_lock
);
2365 if ((file_lock
->fl_flags
& FL_POSIX
) &&
2366 (rc
== 0 || file_lock
->fl_type
== F_UNLCK
) &&
2367 !(flags
& LDLM_FL_TEST_LOCK
))
2368 rc2
= posix_lock_file_wait(file
, file_lock
);
2370 if (rc2
&& file_lock
->fl_type
!= F_UNLCK
) {
2371 einfo
.ei_mode
= LCK_NL
;
2372 md_enqueue(sbi
->ll_md_exp
, &einfo
, NULL
,
2373 op_data
, &lockh
, &flock
, 0, NULL
/* req */, flags
);
2377 ll_finish_md_op_data(op_data
);
2382 int ll_file_noflock(struct file
*file
, int cmd
, struct file_lock
*file_lock
)
2388 * test if some locks matching bits and l_req_mode are acquired
2389 * - bits can be in different locks
2390 * - if found clear the common lock bits in *bits
2391 * - the bits not found, are kept in *bits
2393 * \param bits [IN] searched lock bits [IN]
2394 * \param l_req_mode [IN] searched lock mode
2395 * \retval boolean, true iff all bits are found
2397 int ll_have_md_lock(struct inode
*inode
, __u64
*bits
, ldlm_mode_t l_req_mode
)
2399 struct lustre_handle lockh
;
2400 ldlm_policy_data_t policy
;
2401 ldlm_mode_t mode
= (l_req_mode
== LCK_MINMODE
) ?
2402 (LCK_CR
|LCK_CW
|LCK_PR
|LCK_PW
) : l_req_mode
;
2410 fid
= &ll_i2info(inode
)->lli_fid
;
2411 CDEBUG(D_INFO
, "trying to match res "DFID
" mode %s\n", PFID(fid
),
2412 ldlm_lockname
[mode
]);
2414 flags
= LDLM_FL_BLOCK_GRANTED
| LDLM_FL_CBPENDING
| LDLM_FL_TEST_LOCK
;
2415 for (i
= 0; i
<= MDS_INODELOCK_MAXSHIFT
&& *bits
!= 0; i
++) {
2416 policy
.l_inodebits
.bits
= *bits
& (1 << i
);
2417 if (policy
.l_inodebits
.bits
== 0)
2420 if (md_lock_match(ll_i2mdexp(inode
), flags
, fid
, LDLM_IBITS
,
2421 &policy
, mode
, &lockh
)) {
2422 struct ldlm_lock
*lock
;
2424 lock
= ldlm_handle2lock(&lockh
);
2427 ~(lock
->l_policy_data
.l_inodebits
.bits
);
2428 LDLM_LOCK_PUT(lock
);
2430 *bits
&= ~policy
.l_inodebits
.bits
;
2437 ldlm_mode_t
ll_take_md_lock(struct inode
*inode
, __u64 bits
,
2438 struct lustre_handle
*lockh
, __u64 flags
)
2440 ldlm_policy_data_t policy
= { .l_inodebits
= {bits
}};
2444 fid
= &ll_i2info(inode
)->lli_fid
;
2445 CDEBUG(D_INFO
, "trying to match res "DFID
"\n", PFID(fid
));
2447 rc
= md_lock_match(ll_i2mdexp(inode
), LDLM_FL_BLOCK_GRANTED
|flags
,
2448 fid
, LDLM_IBITS
, &policy
,
2449 LCK_CR
|LCK_CW
|LCK_PR
|LCK_PW
, lockh
);
2453 static int ll_inode_revalidate_fini(struct inode
*inode
, int rc
)
2455 /* Already unlinked. Just update nlink and return success */
2456 if (rc
== -ENOENT
) {
2458 /* This path cannot be hit for regular files unless in
2459 * case of obscure races, so no need to to validate
2461 if (!S_ISREG(inode
->i_mode
) && !S_ISDIR(inode
->i_mode
))
2463 } else if (rc
!= 0) {
2464 CERROR("%s: revalidate FID "DFID
" error: rc = %d\n",
2465 ll_get_fsname(inode
->i_sb
, NULL
, 0),
2466 PFID(ll_inode2fid(inode
)), rc
);
2472 int __ll_inode_revalidate_it(struct dentry
*dentry
, struct lookup_intent
*it
,
2475 struct inode
*inode
= dentry
->d_inode
;
2476 struct ptlrpc_request
*req
= NULL
;
2477 struct obd_export
*exp
;
2480 LASSERT(inode
!= NULL
);
2482 CDEBUG(D_VFSTRACE
, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2483 inode
->i_ino
, inode
->i_generation
, inode
, dentry
->d_name
.name
);
2485 exp
= ll_i2mdexp(inode
);
2487 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2488 * But under CMD case, it caused some lock issues, should be fixed
2489 * with new CMD ibits lock. See bug 12718 */
2490 if (exp_connect_flags(exp
) & OBD_CONNECT_ATTRFID
) {
2491 struct lookup_intent oit
= { .it_op
= IT_GETATTR
};
2492 struct md_op_data
*op_data
;
2494 if (ibits
== MDS_INODELOCK_LOOKUP
)
2495 oit
.it_op
= IT_LOOKUP
;
2497 /* Call getattr by fid, so do not provide name at all. */
2498 op_data
= ll_prep_md_op_data(NULL
, dentry
->d_parent
->d_inode
,
2499 dentry
->d_inode
, NULL
, 0, 0,
2500 LUSTRE_OPC_ANY
, NULL
);
2501 if (IS_ERR(op_data
))
2502 return PTR_ERR(op_data
);
2504 oit
.it_create_mode
|= M_CHECK_STALE
;
2505 rc
= md_intent_lock(exp
, op_data
, NULL
, 0,
2506 /* we are not interested in name
2509 ll_md_blocking_ast
, 0);
2510 ll_finish_md_op_data(op_data
);
2511 oit
.it_create_mode
&= ~M_CHECK_STALE
;
2513 rc
= ll_inode_revalidate_fini(inode
, rc
);
2517 rc
= ll_revalidate_it_finish(req
, &oit
, dentry
);
2519 ll_intent_release(&oit
);
2523 /* Unlinked? Unhash dentry, so it is not picked up later by
2524 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2525 here to preserve get_cwd functionality on 2.6.
2527 if (!dentry
->d_inode
->i_nlink
)
2528 d_lustre_invalidate(dentry
, 0);
2530 ll_lookup_finish_locks(&oit
, dentry
);
2531 } else if (!ll_have_md_lock(dentry
->d_inode
, &ibits
, LCK_MINMODE
)) {
2532 struct ll_sb_info
*sbi
= ll_i2sbi(dentry
->d_inode
);
2533 obd_valid valid
= OBD_MD_FLGETATTR
;
2534 struct md_op_data
*op_data
;
2537 if (S_ISREG(inode
->i_mode
)) {
2538 rc
= ll_get_max_mdsize(sbi
, &ealen
);
2541 valid
|= OBD_MD_FLEASIZE
| OBD_MD_FLMODEASIZE
;
2544 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
,
2545 0, ealen
, LUSTRE_OPC_ANY
,
2547 if (IS_ERR(op_data
))
2548 return PTR_ERR(op_data
);
2550 op_data
->op_valid
= valid
;
2551 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2552 * capa for this inode. Because we only keep capas of dirs
2554 rc
= md_getattr(sbi
->ll_md_exp
, op_data
, &req
);
2555 ll_finish_md_op_data(op_data
);
2557 rc
= ll_inode_revalidate_fini(inode
, rc
);
2561 rc
= ll_prep_inode(&inode
, req
, NULL
, NULL
);
2564 ptlrpc_req_finished(req
);
2568 int ll_inode_revalidate_it(struct dentry
*dentry
, struct lookup_intent
*it
,
2571 struct inode
*inode
= dentry
->d_inode
;
2574 rc
= __ll_inode_revalidate_it(dentry
, it
, ibits
);
2578 /* if object isn't regular file, don't validate size */
2579 if (!S_ISREG(inode
->i_mode
)) {
2580 LTIME_S(inode
->i_atime
) = ll_i2info(inode
)->lli_lvb
.lvb_atime
;
2581 LTIME_S(inode
->i_mtime
) = ll_i2info(inode
)->lli_lvb
.lvb_mtime
;
2582 LTIME_S(inode
->i_ctime
) = ll_i2info(inode
)->lli_lvb
.lvb_ctime
;
2584 rc
= ll_glimpse_size(inode
);
2589 int ll_getattr_it(struct vfsmount
*mnt
, struct dentry
*de
,
2590 struct lookup_intent
*it
, struct kstat
*stat
)
2592 struct inode
*inode
= de
->d_inode
;
2593 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
2594 struct ll_inode_info
*lli
= ll_i2info(inode
);
2597 res
= ll_inode_revalidate_it(de
, it
, MDS_INODELOCK_UPDATE
|
2598 MDS_INODELOCK_LOOKUP
);
2599 ll_stats_ops_tally(sbi
, LPROC_LL_GETATTR
, 1);
2604 stat
->dev
= inode
->i_sb
->s_dev
;
2605 if (ll_need_32bit_api(sbi
))
2606 stat
->ino
= cl_fid_build_ino(&lli
->lli_fid
, 1);
2608 stat
->ino
= inode
->i_ino
;
2609 stat
->mode
= inode
->i_mode
;
2610 stat
->nlink
= inode
->i_nlink
;
2611 stat
->uid
= inode
->i_uid
;
2612 stat
->gid
= inode
->i_gid
;
2613 stat
->rdev
= inode
->i_rdev
;
2614 stat
->atime
= inode
->i_atime
;
2615 stat
->mtime
= inode
->i_mtime
;
2616 stat
->ctime
= inode
->i_ctime
;
2617 stat
->blksize
= 1 << inode
->i_blkbits
;
2619 stat
->size
= i_size_read(inode
);
2620 stat
->blocks
= inode
->i_blocks
;
2624 int ll_getattr(struct vfsmount
*mnt
, struct dentry
*de
, struct kstat
*stat
)
2626 struct lookup_intent it
= { .it_op
= IT_GETATTR
};
2628 return ll_getattr_it(mnt
, de
, &it
, stat
);
2632 struct posix_acl
* ll_get_acl(struct inode
*inode
, int type
)
2634 struct ll_inode_info
*lli
= ll_i2info(inode
);
2635 struct posix_acl
*acl
= NULL
;
2637 spin_lock(&lli
->lli_lock
);
2638 /* VFS' acl_permission_check->check_acl will release the refcount */
2639 acl
= posix_acl_dup(lli
->lli_posix_acl
);
2640 spin_unlock(&lli
->lli_lock
);
2646 int ll_inode_permission(struct inode
*inode
, int mask
)
2650 #ifdef MAY_NOT_BLOCK
2651 if (mask
& MAY_NOT_BLOCK
)
2655 /* as root inode are NOT getting validated in lookup operation,
2656 * need to do it before permission check. */
2658 if (inode
== inode
->i_sb
->s_root
->d_inode
) {
2659 struct lookup_intent it
= { .it_op
= IT_LOOKUP
};
2661 rc
= __ll_inode_revalidate_it(inode
->i_sb
->s_root
, &it
,
2662 MDS_INODELOCK_LOOKUP
);
2667 CDEBUG(D_VFSTRACE
, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2668 inode
->i_ino
, inode
->i_generation
, inode
, inode
->i_mode
, mask
);
2670 if (ll_i2sbi(inode
)->ll_flags
& LL_SBI_RMT_CLIENT
)
2671 return lustre_check_remote_perm(inode
, mask
);
2673 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_INODE_PERM
, 1);
2674 rc
= generic_permission(inode
, mask
);
2679 #define READ_METHOD aio_read
2680 #define READ_FUNCTION ll_file_aio_read
2681 #define WRITE_METHOD aio_write
2682 #define WRITE_FUNCTION ll_file_aio_write
2684 /* -o localflock - only provides locally consistent flock locks */
2685 struct file_operations ll_file_operations
= {
2686 .read
= ll_file_read
,
2687 .READ_METHOD
= READ_FUNCTION
,
2688 .write
= ll_file_write
,
2689 .WRITE_METHOD
= WRITE_FUNCTION
,
2690 .unlocked_ioctl
= ll_file_ioctl
,
2691 .open
= ll_file_open
,
2692 .release
= ll_file_release
,
2693 .mmap
= ll_file_mmap
,
2694 .llseek
= ll_file_seek
,
2695 .splice_read
= ll_file_splice_read
,
2700 struct file_operations ll_file_operations_flock
= {
2701 .read
= ll_file_read
,
2702 .READ_METHOD
= READ_FUNCTION
,
2703 .write
= ll_file_write
,
2704 .WRITE_METHOD
= WRITE_FUNCTION
,
2705 .unlocked_ioctl
= ll_file_ioctl
,
2706 .open
= ll_file_open
,
2707 .release
= ll_file_release
,
2708 .mmap
= ll_file_mmap
,
2709 .llseek
= ll_file_seek
,
2710 .splice_read
= ll_file_splice_read
,
2713 .flock
= ll_file_flock
,
2714 .lock
= ll_file_flock
2717 /* These are for -o noflock - to return ENOSYS on flock calls */
2718 struct file_operations ll_file_operations_noflock
= {
2719 .read
= ll_file_read
,
2720 .READ_METHOD
= READ_FUNCTION
,
2721 .write
= ll_file_write
,
2722 .WRITE_METHOD
= WRITE_FUNCTION
,
2723 .unlocked_ioctl
= ll_file_ioctl
,
2724 .open
= ll_file_open
,
2725 .release
= ll_file_release
,
2726 .mmap
= ll_file_mmap
,
2727 .llseek
= ll_file_seek
,
2728 .splice_read
= ll_file_splice_read
,
2731 .flock
= ll_file_noflock
,
2732 .lock
= ll_file_noflock
2735 struct inode_operations ll_file_inode_operations
= {
2736 .setattr
= ll_setattr
,
2737 .getattr
= ll_getattr
,
2738 .permission
= ll_inode_permission
,
2739 .setxattr
= ll_setxattr
,
2740 .getxattr
= ll_getxattr
,
2741 .listxattr
= ll_listxattr
,
2742 .removexattr
= ll_removexattr
,
2743 .get_acl
= ll_get_acl
,
2746 /* dynamic ioctl number support routins */
2747 static struct llioc_ctl_data
{
2748 struct rw_semaphore ioc_sem
;
2749 struct list_head ioc_head
;
2751 __RWSEM_INITIALIZER(llioc
.ioc_sem
),
2752 LIST_HEAD_INIT(llioc
.ioc_head
)
2757 struct list_head iocd_list
;
2758 unsigned int iocd_size
;
2759 llioc_callback_t iocd_cb
;
2760 unsigned int iocd_count
;
2761 unsigned int iocd_cmd
[0];
2764 void *ll_iocontrol_register(llioc_callback_t cb
, int count
, unsigned int *cmd
)
2767 struct llioc_data
*in_data
= NULL
;
2769 if (cb
== NULL
|| cmd
== NULL
||
2770 count
> LLIOC_MAX_CMD
|| count
< 0)
2773 size
= sizeof(*in_data
) + count
* sizeof(unsigned int);
2774 OBD_ALLOC(in_data
, size
);
2775 if (in_data
== NULL
)
2778 memset(in_data
, 0, sizeof(*in_data
));
2779 in_data
->iocd_size
= size
;
2780 in_data
->iocd_cb
= cb
;
2781 in_data
->iocd_count
= count
;
2782 memcpy(in_data
->iocd_cmd
, cmd
, sizeof(unsigned int) * count
);
2784 down_write(&llioc
.ioc_sem
);
2785 list_add_tail(&in_data
->iocd_list
, &llioc
.ioc_head
);
2786 up_write(&llioc
.ioc_sem
);
2791 void ll_iocontrol_unregister(void *magic
)
2793 struct llioc_data
*tmp
;
2798 down_write(&llioc
.ioc_sem
);
2799 list_for_each_entry(tmp
, &llioc
.ioc_head
, iocd_list
) {
2801 unsigned int size
= tmp
->iocd_size
;
2803 list_del(&tmp
->iocd_list
);
2804 up_write(&llioc
.ioc_sem
);
2806 OBD_FREE(tmp
, size
);
2810 up_write(&llioc
.ioc_sem
);
2812 CWARN("didn't find iocontrol register block with magic: %p\n", magic
);
2815 EXPORT_SYMBOL(ll_iocontrol_register
);
2816 EXPORT_SYMBOL(ll_iocontrol_unregister
);
2818 enum llioc_iter
ll_iocontrol_call(struct inode
*inode
, struct file
*file
,
2819 unsigned int cmd
, unsigned long arg
, int *rcp
)
2821 enum llioc_iter ret
= LLIOC_CONT
;
2822 struct llioc_data
*data
;
2823 int rc
= -EINVAL
, i
;
2825 down_read(&llioc
.ioc_sem
);
2826 list_for_each_entry(data
, &llioc
.ioc_head
, iocd_list
) {
2827 for (i
= 0; i
< data
->iocd_count
; i
++) {
2828 if (cmd
!= data
->iocd_cmd
[i
])
2831 ret
= data
->iocd_cb(inode
, file
, cmd
, arg
, data
, &rc
);
2835 if (ret
== LLIOC_STOP
)
2838 up_read(&llioc
.ioc_sem
);
2845 int ll_layout_conf(struct inode
*inode
, const struct cl_object_conf
*conf
)
2847 struct ll_inode_info
*lli
= ll_i2info(inode
);
2848 struct cl_env_nest nest
;
2852 if (lli
->lli_clob
== NULL
)
2855 env
= cl_env_nested_get(&nest
);
2857 return PTR_ERR(env
);
2859 result
= cl_conf_set(env
, lli
->lli_clob
, conf
);
2860 cl_env_nested_put(&nest
, env
);
2862 if (conf
->coc_opc
== OBJECT_CONF_SET
) {
2863 struct ldlm_lock
*lock
= conf
->coc_lock
;
2865 LASSERT(lock
!= NULL
);
2866 LASSERT(ldlm_has_layout(lock
));
2868 /* it can only be allowed to match after layout is
2869 * applied to inode otherwise false layout would be
2870 * seen. Applying layout shoud happen before dropping
2871 * the intent lock. */
2872 ldlm_lock_allow_match(lock
);
2878 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
2879 static int ll_layout_fetch(struct inode
*inode
, struct ldlm_lock
*lock
)
2882 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
2883 struct obd_capa
*oc
;
2884 struct ptlrpc_request
*req
;
2885 struct mdt_body
*body
;
2891 CDEBUG(D_INODE
, DFID
" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
2892 PFID(ll_inode2fid(inode
)), !!(lock
->l_flags
& LDLM_FL_LVB_READY
),
2893 lock
->l_lvb_data
, lock
->l_lvb_len
);
2895 if ((lock
->l_lvb_data
!= NULL
) && (lock
->l_flags
& LDLM_FL_LVB_READY
))
2898 /* if layout lock was granted right away, the layout is returned
2899 * within DLM_LVB of dlm reply; otherwise if the lock was ever
2900 * blocked and then granted via completion ast, we have to fetch
2901 * layout here. Please note that we can't use the LVB buffer in
2902 * completion AST because it doesn't have a large enough buffer */
2903 oc
= ll_mdscapa_get(inode
);
2904 rc
= ll_get_max_mdsize(sbi
, &lmmsize
);
2906 rc
= md_getxattr(sbi
->ll_md_exp
, ll_inode2fid(inode
), oc
,
2907 OBD_MD_FLXATTR
, XATTR_NAME_LOV
, NULL
, 0,
2913 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
2914 if (body
== NULL
|| body
->eadatasize
> lmmsize
)
2915 GOTO(out
, rc
= -EPROTO
);
2917 lmmsize
= body
->eadatasize
;
2918 if (lmmsize
== 0) /* empty layout */
2921 lmm
= req_capsule_server_sized_get(&req
->rq_pill
, &RMF_EADATA
, lmmsize
);
2923 GOTO(out
, rc
= -EFAULT
);
2925 OBD_ALLOC_LARGE(lvbdata
, lmmsize
);
2926 if (lvbdata
== NULL
)
2927 GOTO(out
, rc
= -ENOMEM
);
2929 memcpy(lvbdata
, lmm
, lmmsize
);
2930 lock_res_and_lock(lock
);
2931 if (lock
->l_lvb_data
!= NULL
)
2932 OBD_FREE_LARGE(lock
->l_lvb_data
, lock
->l_lvb_len
);
2934 lock
->l_lvb_data
= lvbdata
;
2935 lock
->l_lvb_len
= lmmsize
;
2936 unlock_res_and_lock(lock
);
2939 ptlrpc_req_finished(req
);
2944 * Apply the layout to the inode. Layout lock is held and will be released
2947 static int ll_layout_lock_set(struct lustre_handle
*lockh
, ldlm_mode_t mode
,
2948 struct inode
*inode
, __u32
*gen
, bool reconf
)
2950 struct ll_inode_info
*lli
= ll_i2info(inode
);
2951 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
2952 struct ldlm_lock
*lock
;
2953 struct lustre_md md
= { NULL
};
2954 struct cl_object_conf conf
;
2957 bool wait_layout
= false;
2959 LASSERT(lustre_handle_is_used(lockh
));
2961 lock
= ldlm_handle2lock(lockh
);
2962 LASSERT(lock
!= NULL
);
2963 LASSERT(ldlm_has_layout(lock
));
2965 LDLM_DEBUG(lock
, "File %p/"DFID
" being reconfigured: %d.\n",
2966 inode
, PFID(&lli
->lli_fid
), reconf
);
2968 /* in case this is a caching lock and reinstate with new inode */
2969 md_set_lock_data(sbi
->ll_md_exp
, &lockh
->cookie
, inode
, NULL
);
2971 lock_res_and_lock(lock
);
2972 lvb_ready
= !!(lock
->l_flags
& LDLM_FL_LVB_READY
);
2973 unlock_res_and_lock(lock
);
2974 /* checking lvb_ready is racy but this is okay. The worst case is
2975 * that multi processes may configure the file on the same time. */
2976 if (lvb_ready
|| !reconf
) {
2979 /* layout_gen must be valid if layout lock is not
2980 * cancelled and stripe has already set */
2981 *gen
= lli
->lli_layout_gen
;
2987 rc
= ll_layout_fetch(inode
, lock
);
2991 /* for layout lock, lmm is returned in lock's lvb.
2992 * lvb_data is immutable if the lock is held so it's safe to access it
2993 * without res lock. See the description in ldlm_lock_decref_internal()
2994 * for the condition to free lvb_data of layout lock */
2995 if (lock
->l_lvb_data
!= NULL
) {
2996 rc
= obd_unpackmd(sbi
->ll_dt_exp
, &md
.lsm
,
2997 lock
->l_lvb_data
, lock
->l_lvb_len
);
2999 *gen
= LL_LAYOUT_GEN_EMPTY
;
3001 *gen
= md
.lsm
->lsm_layout_gen
;
3004 CERROR("%s: file "DFID
" unpackmd error: %d\n",
3005 ll_get_fsname(inode
->i_sb
, NULL
, 0),
3006 PFID(&lli
->lli_fid
), rc
);
3012 /* set layout to file. Unlikely this will fail as old layout was
3013 * surely eliminated */
3014 memset(&conf
, 0, sizeof conf
);
3015 conf
.coc_opc
= OBJECT_CONF_SET
;
3016 conf
.coc_inode
= inode
;
3017 conf
.coc_lock
= lock
;
3018 conf
.u
.coc_md
= &md
;
3019 rc
= ll_layout_conf(inode
, &conf
);
3022 obd_free_memmd(sbi
->ll_dt_exp
, &md
.lsm
);
3024 /* refresh layout failed, need to wait */
3025 wait_layout
= rc
== -EBUSY
;
3028 LDLM_LOCK_PUT(lock
);
3029 ldlm_lock_decref(lockh
, mode
);
3031 /* wait for IO to complete if it's still being used. */
3033 CDEBUG(D_INODE
, "%s: %p/"DFID
" wait for layout reconf.\n",
3034 ll_get_fsname(inode
->i_sb
, NULL
, 0),
3035 inode
, PFID(&lli
->lli_fid
));
3037 memset(&conf
, 0, sizeof conf
);
3038 conf
.coc_opc
= OBJECT_CONF_WAIT
;
3039 conf
.coc_inode
= inode
;
3040 rc
= ll_layout_conf(inode
, &conf
);
3044 CDEBUG(D_INODE
, "file: "DFID
" waiting layout return: %d.\n",
3045 PFID(&lli
->lli_fid
), rc
);
3051 * This function checks if there exists a LAYOUT lock on the client side,
3052 * or enqueues it if it doesn't have one in cache.
3054 * This function will not hold layout lock so it may be revoked any time after
3055 * this function returns. Any operations depend on layout should be redone
3058 * This function should be called before lov_io_init() to get an uptodate
3059 * layout version, the caller should save the version number and after IO
3060 * is finished, this function should be called again to verify that layout
3061 * is not changed during IO time.
3063 int ll_layout_refresh(struct inode
*inode
, __u32
*gen
)
3065 struct ll_inode_info
*lli
= ll_i2info(inode
);
3066 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
3067 struct md_op_data
*op_data
;
3068 struct lookup_intent it
;
3069 struct lustre_handle lockh
;
3071 struct ldlm_enqueue_info einfo
= {
3072 .ei_type
= LDLM_IBITS
,
3074 .ei_cb_bl
= ll_md_blocking_ast
,
3075 .ei_cb_cp
= ldlm_completion_ast
,
3079 *gen
= lli
->lli_layout_gen
;
3080 if (!(sbi
->ll_flags
& LL_SBI_LAYOUT_LOCK
))
3084 LASSERT(fid_is_sane(ll_inode2fid(inode
)));
3085 LASSERT(S_ISREG(inode
->i_mode
));
3087 /* mostly layout lock is caching on the local side, so try to match
3088 * it before grabbing layout lock mutex. */
3089 mode
= ll_take_md_lock(inode
, MDS_INODELOCK_LAYOUT
, &lockh
, 0);
3090 if (mode
!= 0) { /* hit cached lock */
3091 rc
= ll_layout_lock_set(&lockh
, mode
, inode
, gen
, false);
3095 /* better hold lli_layout_mutex to try again otherwise
3096 * it will have starvation problem. */
3099 /* take layout lock mutex to enqueue layout lock exclusively. */
3100 mutex_lock(&lli
->lli_layout_mutex
);
3103 /* try again. Maybe somebody else has done this. */
3104 mode
= ll_take_md_lock(inode
, MDS_INODELOCK_LAYOUT
, &lockh
, 0);
3105 if (mode
!= 0) { /* hit cached lock */
3106 rc
= ll_layout_lock_set(&lockh
, mode
, inode
, gen
, true);
3110 mutex_unlock(&lli
->lli_layout_mutex
);
3114 op_data
= ll_prep_md_op_data(NULL
, inode
, inode
, NULL
,
3115 0, 0, LUSTRE_OPC_ANY
, NULL
);
3116 if (IS_ERR(op_data
)) {
3117 mutex_unlock(&lli
->lli_layout_mutex
);
3118 return PTR_ERR(op_data
);
3121 /* have to enqueue one */
3122 memset(&it
, 0, sizeof(it
));
3123 it
.it_op
= IT_LAYOUT
;
3124 lockh
.cookie
= 0ULL;
3126 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID
".\n",
3127 ll_get_fsname(inode
->i_sb
, NULL
, 0), inode
,
3128 PFID(&lli
->lli_fid
));
3130 rc
= md_enqueue(sbi
->ll_md_exp
, &einfo
, &it
, op_data
, &lockh
,
3132 if (it
.d
.lustre
.it_data
!= NULL
)
3133 ptlrpc_req_finished(it
.d
.lustre
.it_data
);
3134 it
.d
.lustre
.it_data
= NULL
;
3136 ll_finish_md_op_data(op_data
);
3138 mode
= it
.d
.lustre
.it_lock_mode
;
3139 it
.d
.lustre
.it_lock_mode
= 0;
3140 ll_intent_drop_lock(&it
);
3143 /* set lock data in case this is a new lock */
3144 ll_set_lock_data(sbi
->ll_md_exp
, inode
, &it
, NULL
);
3145 rc
= ll_layout_lock_set(&lockh
, mode
, inode
, gen
, true);
3149 mutex_unlock(&lli
->lli_layout_mutex
);