1 // SPDX-License-Identifier: GPL-2.0
5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 only,
9 * as published by the Free Software Foundation.
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License version 2 for more details (a copy is included
15 * in the LICENSE file that accompanied this code).
17 * You should have received a copy of the GNU General Public License
18 * version 2 along with this program; If not, see
19 * http://www.gnu.org/licenses/gpl-2.0.html
24 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Use is subject to license terms.
27 * Copyright (c) 2011, 2015, Intel Corporation.
30 * This file is part of Lustre, http://www.lustre.org/
31 * Lustre is a trademark of Sun Microsystems, Inc.
33 * lustre/llite/llite_lib.c
35 * Lustre Light Super operations
38 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <linux/module.h>
41 #include <linux/statfs.h>
42 #include <linux/types.h>
45 #include <uapi/linux/lustre/lustre_ioctl.h>
46 #include <lustre_ha.h>
47 #include <lustre_dlm.h>
48 #include <lprocfs_status.h>
49 #include <lustre_disk.h>
50 #include <uapi/linux/lustre/lustre_param.h>
51 #include <lustre_log.h>
52 #include <cl_object.h>
53 #include <obd_cksum.h>
54 #include "llite_internal.h"
56 struct kmem_cache
*ll_file_data_slab
;
57 struct dentry
*llite_root
;
58 struct kset
*llite_kset
;
61 #define log2(n) ffz(~(n))
64 static struct ll_sb_info
*ll_init_sbi(struct super_block
*sb
)
66 struct ll_sb_info
*sbi
= NULL
;
68 unsigned long lru_page_max
;
73 sbi
= kzalloc(sizeof(*sbi
), GFP_NOFS
);
77 spin_lock_init(&sbi
->ll_lock
);
78 mutex_init(&sbi
->ll_lco
.lco_lock
);
79 spin_lock_init(&sbi
->ll_pp_extent_lock
);
80 spin_lock_init(&sbi
->ll_process_lock
);
81 sbi
->ll_rw_stats_on
= 0;
84 pages
= si
.totalram
- si
.totalhigh
;
85 lru_page_max
= pages
/ 2;
87 sbi
->ll_cache
= cl_cache_init(lru_page_max
);
93 sbi
->ll_ra_info
.ra_max_pages_per_file
= min(pages
/ 32,
94 SBI_DEFAULT_READAHEAD_MAX
);
95 sbi
->ll_ra_info
.ra_max_pages
= sbi
->ll_ra_info
.ra_max_pages_per_file
;
96 sbi
->ll_ra_info
.ra_max_read_ahead_whole_pages
=
97 SBI_DEFAULT_READAHEAD_WHOLE_MAX
;
99 ll_generate_random_uuid(uuid
);
100 class_uuid_unparse(uuid
, &sbi
->ll_sb_uuid
);
101 CDEBUG(D_CONFIG
, "generated uuid: %s\n", sbi
->ll_sb_uuid
.uuid
);
103 sbi
->ll_flags
|= LL_SBI_VERBOSE
;
104 sbi
->ll_flags
|= LL_SBI_CHECKSUM
;
106 sbi
->ll_flags
|= LL_SBI_LRU_RESIZE
;
107 sbi
->ll_flags
|= LL_SBI_LAZYSTATFS
;
109 for (i
= 0; i
<= LL_PROCESS_HIST_MAX
; i
++) {
110 spin_lock_init(&sbi
->ll_rw_extents_info
.pp_extents
[i
].
112 spin_lock_init(&sbi
->ll_rw_extents_info
.pp_extents
[i
].
116 /* metadata statahead is enabled by default */
117 sbi
->ll_sa_max
= LL_SA_RPC_DEF
;
118 atomic_set(&sbi
->ll_sa_total
, 0);
119 atomic_set(&sbi
->ll_sa_wrong
, 0);
120 atomic_set(&sbi
->ll_sa_running
, 0);
121 atomic_set(&sbi
->ll_agl_total
, 0);
122 sbi
->ll_flags
|= LL_SBI_AGL_ENABLED
;
125 sbi
->ll_squash
.rsi_uid
= 0;
126 sbi
->ll_squash
.rsi_gid
= 0;
127 INIT_LIST_HEAD(&sbi
->ll_squash
.rsi_nosquash_nids
);
128 init_rwsem(&sbi
->ll_squash
.rsi_sem
);
135 static void ll_free_sbi(struct super_block
*sb
)
137 struct ll_sb_info
*sbi
= ll_s2sbi(sb
);
140 if (!list_empty(&sbi
->ll_squash
.rsi_nosquash_nids
))
141 cfs_free_nidlist(&sbi
->ll_squash
.rsi_nosquash_nids
);
142 cl_cache_decref(sbi
->ll_cache
);
143 sbi
->ll_cache
= NULL
;
149 static int client_common_fill_super(struct super_block
*sb
, char *md
, char *dt
,
150 struct vfsmount
*mnt
)
152 struct inode
*root
= NULL
;
153 struct ll_sb_info
*sbi
= ll_s2sbi(sb
);
154 struct obd_device
*obd
;
155 struct obd_statfs
*osfs
= NULL
;
156 struct ptlrpc_request
*request
= NULL
;
157 struct obd_connect_data
*data
= NULL
;
158 struct obd_uuid
*uuid
;
159 struct md_op_data
*op_data
;
160 struct lustre_md lmd
;
162 int size
, err
, checksum
;
164 obd
= class_name2obd(md
);
166 CERROR("MD %s: not setup or attached\n", md
);
170 data
= kzalloc(sizeof(*data
), GFP_NOFS
);
174 osfs
= kzalloc(sizeof(*osfs
), GFP_NOFS
);
180 /* indicate the features supported by this client */
181 data
->ocd_connect_flags
= OBD_CONNECT_IBITS
| OBD_CONNECT_NODEVOH
|
182 OBD_CONNECT_ATTRFID
|
183 OBD_CONNECT_VERSION
| OBD_CONNECT_BRW_SIZE
|
184 OBD_CONNECT_CANCELSET
| OBD_CONNECT_FID
|
185 OBD_CONNECT_AT
| OBD_CONNECT_LOV_V3
|
186 OBD_CONNECT_VBR
| OBD_CONNECT_FULL20
|
187 OBD_CONNECT_64BITHASH
|
188 OBD_CONNECT_EINPROGRESS
|
189 OBD_CONNECT_JOBSTATS
| OBD_CONNECT_LVB_TYPE
|
190 OBD_CONNECT_LAYOUTLOCK
|
191 OBD_CONNECT_PINGLESS
|
192 OBD_CONNECT_MAX_EASIZE
|
193 OBD_CONNECT_FLOCK_DEAD
|
194 OBD_CONNECT_DISP_STRIPE
| OBD_CONNECT_LFSCK
|
195 OBD_CONNECT_OPEN_BY_FID
|
196 OBD_CONNECT_DIR_STRIPE
|
197 OBD_CONNECT_BULK_MBITS
;
199 if (sbi
->ll_flags
& LL_SBI_LRU_RESIZE
)
200 data
->ocd_connect_flags
|= OBD_CONNECT_LRU_RESIZE
;
201 #ifdef CONFIG_FS_POSIX_ACL
202 data
->ocd_connect_flags
|= OBD_CONNECT_ACL
| OBD_CONNECT_UMASK
;
205 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT
))
206 /* flag mdc connection as lightweight, only used for test
207 * purpose, use with care
209 data
->ocd_connect_flags
|= OBD_CONNECT_LIGHTWEIGHT
;
211 data
->ocd_ibits_known
= MDS_INODELOCK_FULL
;
212 data
->ocd_version
= LUSTRE_VERSION_CODE
;
215 data
->ocd_connect_flags
|= OBD_CONNECT_RDONLY
;
216 if (sbi
->ll_flags
& LL_SBI_USER_XATTR
)
217 data
->ocd_connect_flags
|= OBD_CONNECT_XATTR
;
219 if (sbi
->ll_flags
& LL_SBI_FLOCK
)
220 sbi
->ll_fop
= &ll_file_operations_flock
;
221 else if (sbi
->ll_flags
& LL_SBI_LOCALFLOCK
)
222 sbi
->ll_fop
= &ll_file_operations
;
224 sbi
->ll_fop
= &ll_file_operations_noflock
;
226 /* always ping even if server suppress_pings */
227 if (sbi
->ll_flags
& LL_SBI_ALWAYS_PING
)
228 data
->ocd_connect_flags
&= ~OBD_CONNECT_PINGLESS
;
230 data
->ocd_brw_size
= MD_MAX_BRW_SIZE
;
232 err
= obd_connect(NULL
, &sbi
->ll_md_exp
, obd
, &sbi
->ll_sb_uuid
,
235 LCONSOLE_ERROR_MSG(0x14f,
236 "An MDT (md %s) is performing recovery, of which this client is not a part. Please wait for recovery to complete, abort, or time out.\n",
240 CERROR("cannot connect to %s: rc = %d\n", md
, err
);
244 sbi
->ll_md_exp
->exp_connect_data
= *data
;
246 err
= obd_fid_init(sbi
->ll_md_exp
->exp_obd
, sbi
->ll_md_exp
,
247 LUSTRE_SEQ_METADATA
);
249 CERROR("%s: Can't init metadata layer FID infrastructure, rc = %d\n",
250 sbi
->ll_md_exp
->exp_obd
->obd_name
, err
);
254 /* For mount, we only need fs info from MDT0, and also in DNE, it
255 * can make sure the client can be mounted as long as MDT0 is
258 err
= obd_statfs(NULL
, sbi
->ll_md_exp
, osfs
,
259 cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS
),
260 OBD_STATFS_FOR_MDT0
);
264 /* This needs to be after statfs to ensure connect has finished.
265 * Note that "data" does NOT contain the valid connect reply.
266 * If connecting to a 1.8 server there will be no LMV device, so
267 * we can access the MDC export directly and exp_connect_flags will
268 * be non-zero, but if accessing an upgraded 2.1 server it will
269 * have the correct flags filled in.
270 * XXX: fill in the LMV exp_connect_flags from MDC(s).
272 valid
= exp_connect_flags(sbi
->ll_md_exp
) & CLIENT_CONNECT_MDT_REQD
;
273 if (exp_connect_flags(sbi
->ll_md_exp
) != 0 &&
274 valid
!= CLIENT_CONNECT_MDT_REQD
) {
277 buf
= kzalloc(PAGE_SIZE
, GFP_KERNEL
);
282 obd_connect_flags2str(buf
, PAGE_SIZE
,
283 valid
^ CLIENT_CONNECT_MDT_REQD
, ",");
284 LCONSOLE_ERROR_MSG(0x170,
285 "Server %s does not support feature(s) needed for correct operation of this client (%s). Please upgrade server or downgrade client.\n",
286 sbi
->ll_md_exp
->exp_obd
->obd_name
, buf
);
292 size
= sizeof(*data
);
293 err
= obd_get_info(NULL
, sbi
->ll_md_exp
, sizeof(KEY_CONN_DATA
),
294 KEY_CONN_DATA
, &size
, data
);
296 CERROR("%s: Get connect data failed: rc = %d\n",
297 sbi
->ll_md_exp
->exp_obd
->obd_name
, err
);
301 LASSERT(osfs
->os_bsize
);
302 sb
->s_blocksize
= osfs
->os_bsize
;
303 sb
->s_blocksize_bits
= log2(osfs
->os_bsize
);
304 sb
->s_magic
= LL_SUPER_MAGIC
;
305 sb
->s_maxbytes
= MAX_LFS_FILESIZE
;
306 sbi
->ll_namelen
= osfs
->os_namelen
;
307 sbi
->ll_mnt
.mnt
= current
->fs
->root
.mnt
;
309 if ((sbi
->ll_flags
& LL_SBI_USER_XATTR
) &&
310 !(data
->ocd_connect_flags
& OBD_CONNECT_XATTR
)) {
311 LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n");
312 sbi
->ll_flags
&= ~LL_SBI_USER_XATTR
;
315 if (data
->ocd_connect_flags
& OBD_CONNECT_ACL
) {
316 sb
->s_flags
|= MS_POSIXACL
;
317 sbi
->ll_flags
|= LL_SBI_ACL
;
319 LCONSOLE_INFO("client wants to enable acl, but mdt not!\n");
320 sb
->s_flags
&= ~MS_POSIXACL
;
321 sbi
->ll_flags
&= ~LL_SBI_ACL
;
324 if (data
->ocd_connect_flags
& OBD_CONNECT_64BITHASH
)
325 sbi
->ll_flags
|= LL_SBI_64BIT_HASH
;
327 if (data
->ocd_connect_flags
& OBD_CONNECT_BRW_SIZE
)
328 sbi
->ll_md_brw_pages
= data
->ocd_brw_size
>> PAGE_SHIFT
;
330 sbi
->ll_md_brw_pages
= 1;
332 if (data
->ocd_connect_flags
& OBD_CONNECT_LAYOUTLOCK
)
333 sbi
->ll_flags
|= LL_SBI_LAYOUT_LOCK
;
335 if (data
->ocd_ibits_known
& MDS_INODELOCK_XATTR
) {
336 if (!(data
->ocd_connect_flags
& OBD_CONNECT_MAX_EASIZE
)) {
338 "%s: disabling xattr cache due to unknown maximum xattr size.\n",
341 sbi
->ll_flags
|= LL_SBI_XATTR_CACHE
;
342 sbi
->ll_xattr_cache_enabled
= 1;
346 obd
= class_name2obd(dt
);
348 CERROR("DT %s: not setup or attached\n", dt
);
353 data
->ocd_connect_flags
= OBD_CONNECT_GRANT
| OBD_CONNECT_VERSION
|
354 OBD_CONNECT_REQPORTAL
| OBD_CONNECT_BRW_SIZE
|
355 OBD_CONNECT_CANCELSET
| OBD_CONNECT_FID
|
356 OBD_CONNECT_SRVLOCK
| OBD_CONNECT_TRUNCLOCK
|
357 OBD_CONNECT_AT
| OBD_CONNECT_OSS_CAPA
|
358 OBD_CONNECT_VBR
| OBD_CONNECT_FULL20
|
359 OBD_CONNECT_64BITHASH
| OBD_CONNECT_MAXBYTES
|
360 OBD_CONNECT_EINPROGRESS
|
361 OBD_CONNECT_JOBSTATS
| OBD_CONNECT_LVB_TYPE
|
362 OBD_CONNECT_LAYOUTLOCK
|
363 OBD_CONNECT_PINGLESS
| OBD_CONNECT_LFSCK
|
364 OBD_CONNECT_BULK_MBITS
;
366 if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM
)) {
367 /* OBD_CONNECT_CKSUM should always be set, even if checksums are
368 * disabled by default, because it can still be enabled on the
369 * fly via /sys. As a consequence, we still need to come to an
370 * agreement on the supported algorithms at connect time
372 data
->ocd_connect_flags
|= OBD_CONNECT_CKSUM
;
374 if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY
))
375 data
->ocd_cksum_types
= OBD_CKSUM_ADLER
;
377 data
->ocd_cksum_types
= cksum_types_supported_client();
380 data
->ocd_connect_flags
|= OBD_CONNECT_LRU_RESIZE
;
382 /* always ping even if server suppress_pings */
383 if (sbi
->ll_flags
& LL_SBI_ALWAYS_PING
)
384 data
->ocd_connect_flags
&= ~OBD_CONNECT_PINGLESS
;
387 "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d\n",
388 data
->ocd_connect_flags
,
389 data
->ocd_version
, data
->ocd_grant
);
391 obd
->obd_upcall
.onu_owner
= &sbi
->ll_lco
;
392 obd
->obd_upcall
.onu_upcall
= cl_ocd_update
;
394 data
->ocd_brw_size
= DT_MAX_BRW_SIZE
;
396 err
= obd_connect(NULL
, &sbi
->ll_dt_exp
, obd
, &sbi
->ll_sb_uuid
, data
,
399 LCONSOLE_ERROR_MSG(0x150,
400 "An OST (dt %s) is performing recovery, of which this client is not a part. Please wait for recovery to complete, abort, or time out.\n",
404 CERROR("%s: Cannot connect to %s: rc = %d\n",
405 sbi
->ll_dt_exp
->exp_obd
->obd_name
, dt
, err
);
409 sbi
->ll_dt_exp
->exp_connect_data
= *data
;
411 err
= obd_fid_init(sbi
->ll_dt_exp
->exp_obd
, sbi
->ll_dt_exp
,
412 LUSTRE_SEQ_METADATA
);
414 CERROR("%s: Can't init data layer FID infrastructure, rc = %d\n",
415 sbi
->ll_dt_exp
->exp_obd
->obd_name
, err
);
419 mutex_lock(&sbi
->ll_lco
.lco_lock
);
420 sbi
->ll_lco
.lco_flags
= data
->ocd_connect_flags
;
421 sbi
->ll_lco
.lco_md_exp
= sbi
->ll_md_exp
;
422 sbi
->ll_lco
.lco_dt_exp
= sbi
->ll_dt_exp
;
423 mutex_unlock(&sbi
->ll_lco
.lco_lock
);
425 fid_zero(&sbi
->ll_root_fid
);
426 err
= md_getstatus(sbi
->ll_md_exp
, &sbi
->ll_root_fid
);
428 CERROR("cannot mds_connect: rc = %d\n", err
);
431 if (!fid_is_sane(&sbi
->ll_root_fid
)) {
432 CERROR("%s: Invalid root fid " DFID
" during mount\n",
433 sbi
->ll_md_exp
->exp_obd
->obd_name
,
434 PFID(&sbi
->ll_root_fid
));
438 CDEBUG(D_SUPER
, "rootfid " DFID
"\n", PFID(&sbi
->ll_root_fid
));
440 sb
->s_op
= &lustre_super_operations
;
441 sb
->s_xattr
= ll_xattr_handlers
;
442 #if THREAD_SIZE >= 8192 /*b=17630*/
443 sb
->s_export_op
= &lustre_export_operations
;
447 * XXX: move this to after cbd setup?
449 valid
= OBD_MD_FLGETATTR
| OBD_MD_FLBLOCKS
| OBD_MD_FLMODEASIZE
;
450 if (sbi
->ll_flags
& LL_SBI_ACL
)
451 valid
|= OBD_MD_FLACL
;
453 op_data
= kzalloc(sizeof(*op_data
), GFP_NOFS
);
459 op_data
->op_fid1
= sbi
->ll_root_fid
;
460 op_data
->op_mode
= 0;
461 op_data
->op_valid
= valid
;
463 err
= md_getattr(sbi
->ll_md_exp
, op_data
, &request
);
466 CERROR("%s: md_getattr failed for root: rc = %d\n",
467 sbi
->ll_md_exp
->exp_obd
->obd_name
, err
);
471 err
= md_get_lustre_md(sbi
->ll_md_exp
, request
, sbi
->ll_dt_exp
,
472 sbi
->ll_md_exp
, &lmd
);
474 CERROR("failed to understand root inode md: rc = %d\n", err
);
475 ptlrpc_req_finished(request
);
479 LASSERT(fid_is_sane(&sbi
->ll_root_fid
));
480 root
= ll_iget(sb
, cl_fid_build_ino(&sbi
->ll_root_fid
,
481 sbi
->ll_flags
& LL_SBI_32BIT_API
),
483 md_free_lustre_md(sbi
->ll_md_exp
, &lmd
);
484 ptlrpc_req_finished(request
);
487 #ifdef CONFIG_FS_POSIX_ACL
489 posix_acl_release(lmd
.posix_acl
);
490 lmd
.posix_acl
= NULL
;
494 CERROR("lustre_lite: bad iget4 for root\n");
498 checksum
= sbi
->ll_flags
& LL_SBI_CHECKSUM
;
499 err
= obd_set_info_async(NULL
, sbi
->ll_dt_exp
, sizeof(KEY_CHECKSUM
),
500 KEY_CHECKSUM
, sizeof(checksum
), &checksum
,
503 CERROR("%s: Set checksum failed: rc = %d\n",
504 sbi
->ll_dt_exp
->exp_obd
->obd_name
, err
);
509 err
= obd_set_info_async(NULL
, sbi
->ll_dt_exp
, sizeof(KEY_CACHE_SET
),
510 KEY_CACHE_SET
, sizeof(*sbi
->ll_cache
),
511 sbi
->ll_cache
, NULL
);
513 CERROR("%s: Set cache_set failed: rc = %d\n",
514 sbi
->ll_dt_exp
->exp_obd
->obd_name
, err
);
518 sb
->s_root
= d_make_root(root
);
520 CERROR("%s: can't make root dentry\n",
521 ll_get_fsname(sb
, NULL
, 0));
526 sbi
->ll_sdev_orig
= sb
->s_dev
;
528 /* We set sb->s_dev equal on all lustre clients in order to support
529 * NFS export clustering. NFSD requires that the FSID be the same
532 /* s_dev is also used in lt_compare() to compare two fs, but that is
533 * only a node-local comparison.
535 uuid
= obd_get_uuid(sbi
->ll_md_exp
);
537 sb
->s_dev
= get_uuid2int(uuid
->uuid
, strlen(uuid
->uuid
));
538 get_uuid2fsid(uuid
->uuid
, strlen(uuid
->uuid
), &sbi
->ll_fsid
);
545 err
= ldebugfs_register_mountpoint(llite_root
, sb
, dt
, md
);
547 CERROR("%s: could not register mount in debugfs: "
548 "rc = %d\n", ll_get_fsname(sb
, NULL
, 0), err
);
557 obd_fid_fini(sbi
->ll_dt_exp
->exp_obd
);
559 obd_disconnect(sbi
->ll_dt_exp
);
560 sbi
->ll_dt_exp
= NULL
;
562 obd_fid_fini(sbi
->ll_md_exp
->exp_obd
);
564 obd_disconnect(sbi
->ll_md_exp
);
565 sbi
->ll_md_exp
= NULL
;
572 int ll_get_max_mdsize(struct ll_sb_info
*sbi
, int *lmmsize
)
576 size
= sizeof(*lmmsize
);
577 rc
= obd_get_info(NULL
, sbi
->ll_dt_exp
, sizeof(KEY_MAX_EASIZE
),
578 KEY_MAX_EASIZE
, &size
, lmmsize
);
580 CERROR("%s: cannot get max LOV EA size: rc = %d\n",
581 sbi
->ll_dt_exp
->exp_obd
->obd_name
, rc
);
586 rc
= obd_get_info(NULL
, sbi
->ll_md_exp
, sizeof(KEY_MAX_EASIZE
),
587 KEY_MAX_EASIZE
, &size
, lmmsize
);
589 CERROR("Get max mdsize error rc %d\n", rc
);
595 * Get the value of the default_easize parameter.
597 * \see client_obd::cl_default_mds_easize
599 * \param[in] sbi superblock info for this filesystem
600 * \param[out] lmmsize pointer to storage location for value
602 * \retval 0 on success
603 * \retval negative negated errno on failure
605 int ll_get_default_mdsize(struct ll_sb_info
*sbi
, int *lmmsize
)
610 rc
= obd_get_info(NULL
, sbi
->ll_md_exp
, sizeof(KEY_DEFAULT_EASIZE
),
611 KEY_DEFAULT_EASIZE
, &size
, lmmsize
);
613 CERROR("Get default mdsize error rc %d\n", rc
);
619 * Set the default_easize parameter to the given value.
621 * \see client_obd::cl_default_mds_easize
623 * \param[in] sbi superblock info for this filesystem
624 * \param[in] lmmsize the size to set
626 * \retval 0 on success
627 * \retval negative negated errno on failure
629 int ll_set_default_mdsize(struct ll_sb_info
*sbi
, int lmmsize
)
631 if (lmmsize
< sizeof(struct lov_mds_md
) ||
632 lmmsize
> OBD_MAX_DEFAULT_EA_SIZE
)
635 return obd_set_info_async(NULL
, sbi
->ll_md_exp
,
636 sizeof(KEY_DEFAULT_EASIZE
),
638 sizeof(int), &lmmsize
, NULL
);
641 static void client_common_put_super(struct super_block
*sb
)
643 struct ll_sb_info
*sbi
= ll_s2sbi(sb
);
647 obd_fid_fini(sbi
->ll_dt_exp
->exp_obd
);
648 obd_disconnect(sbi
->ll_dt_exp
);
649 sbi
->ll_dt_exp
= NULL
;
651 ldebugfs_unregister_mountpoint(sbi
);
653 obd_fid_fini(sbi
->ll_md_exp
->exp_obd
);
654 obd_disconnect(sbi
->ll_md_exp
);
655 sbi
->ll_md_exp
= NULL
;
658 void ll_kill_super(struct super_block
*sb
)
660 struct ll_sb_info
*sbi
;
663 if (!(sb
->s_flags
& MS_ACTIVE
))
667 /* we need to restore s_dev from changed for clustered NFS before
668 * put_super because new kernels have cached s_dev and change sb->s_dev
669 * in put_super not affected real removing devices
672 sb
->s_dev
= sbi
->ll_sdev_orig
;
673 sbi
->ll_umounting
= 1;
675 /* wait running statahead threads to quit */
676 while (atomic_read(&sbi
->ll_sa_running
) > 0) {
677 set_current_state(TASK_UNINTERRUPTIBLE
);
678 schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC
>> 3));
/*
 * Return @fl if mount-option string @data begins with @opt, else 0.
 * Used by ll_options() to translate option names to LL_SBI_* flag bits.
 */
static inline int ll_set_opt(const char *opt, char *data, int fl)
{
	if (strncmp(opt, data, strlen(opt)) != 0)
		return 0;
	else
		return fl;
}
691 /* non-client-specific mount options are parsed in lmd_parse */
692 static int ll_options(char *options
, int *flags
)
695 char *s1
= options
, *s2
;
700 CDEBUG(D_CONFIG
, "Parsing opts %s\n", options
);
703 CDEBUG(D_SUPER
, "next opt=%s\n", s1
);
704 tmp
= ll_set_opt("nolock", s1
, LL_SBI_NOLCK
);
709 tmp
= ll_set_opt("flock", s1
, LL_SBI_FLOCK
);
714 tmp
= ll_set_opt("localflock", s1
, LL_SBI_LOCALFLOCK
);
719 tmp
= ll_set_opt("noflock", s1
,
720 LL_SBI_FLOCK
| LL_SBI_LOCALFLOCK
);
725 tmp
= ll_set_opt("user_xattr", s1
, LL_SBI_USER_XATTR
);
730 tmp
= ll_set_opt("nouser_xattr", s1
, LL_SBI_USER_XATTR
);
735 tmp
= ll_set_opt("context", s1
, 1);
738 tmp
= ll_set_opt("fscontext", s1
, 1);
741 tmp
= ll_set_opt("defcontext", s1
, 1);
744 tmp
= ll_set_opt("rootcontext", s1
, 1);
747 tmp
= ll_set_opt("user_fid2path", s1
, LL_SBI_USER_FID2PATH
);
752 tmp
= ll_set_opt("nouser_fid2path", s1
, LL_SBI_USER_FID2PATH
);
758 tmp
= ll_set_opt("checksum", s1
, LL_SBI_CHECKSUM
);
763 tmp
= ll_set_opt("nochecksum", s1
, LL_SBI_CHECKSUM
);
768 tmp
= ll_set_opt("lruresize", s1
, LL_SBI_LRU_RESIZE
);
773 tmp
= ll_set_opt("nolruresize", s1
, LL_SBI_LRU_RESIZE
);
778 tmp
= ll_set_opt("lazystatfs", s1
, LL_SBI_LAZYSTATFS
);
783 tmp
= ll_set_opt("nolazystatfs", s1
, LL_SBI_LAZYSTATFS
);
788 tmp
= ll_set_opt("32bitapi", s1
, LL_SBI_32BIT_API
);
793 tmp
= ll_set_opt("verbose", s1
, LL_SBI_VERBOSE
);
798 tmp
= ll_set_opt("noverbose", s1
, LL_SBI_VERBOSE
);
803 tmp
= ll_set_opt("always_ping", s1
, LL_SBI_ALWAYS_PING
);
808 LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n",
814 s2
= strchr(s1
, ',');
822 void ll_lli_init(struct ll_inode_info
*lli
)
824 lli
->lli_inode_magic
= LLI_INODE_MAGIC
;
826 spin_lock_init(&lli
->lli_lock
);
827 lli
->lli_posix_acl
= NULL
;
828 /* Do not set lli_fid, it has been initialized already. */
829 fid_zero(&lli
->lli_pfid
);
830 lli
->lli_mds_read_och
= NULL
;
831 lli
->lli_mds_write_och
= NULL
;
832 lli
->lli_mds_exec_och
= NULL
;
833 lli
->lli_open_fd_read_count
= 0;
834 lli
->lli_open_fd_write_count
= 0;
835 lli
->lli_open_fd_exec_count
= 0;
836 mutex_init(&lli
->lli_och_mutex
);
837 spin_lock_init(&lli
->lli_agl_lock
);
838 spin_lock_init(&lli
->lli_layout_lock
);
839 ll_layout_version_set(lli
, CL_LAYOUT_GEN_NONE
);
840 lli
->lli_clob
= NULL
;
842 init_rwsem(&lli
->lli_xattrs_list_rwsem
);
843 mutex_init(&lli
->lli_xattrs_enq_lock
);
845 LASSERT(lli
->lli_vfs_inode
.i_mode
!= 0);
846 if (S_ISDIR(lli
->lli_vfs_inode
.i_mode
)) {
847 mutex_init(&lli
->lli_readdir_mutex
);
848 lli
->lli_opendir_key
= NULL
;
850 spin_lock_init(&lli
->lli_sa_lock
);
851 lli
->lli_opendir_pid
= 0;
852 lli
->lli_sa_enabled
= 0;
853 lli
->lli_def_stripe_offset
= -1;
855 mutex_init(&lli
->lli_size_mutex
);
856 lli
->lli_symlink_name
= NULL
;
857 init_rwsem(&lli
->lli_trunc_sem
);
858 range_lock_tree_init(&lli
->lli_write_tree
);
859 init_rwsem(&lli
->lli_glimpse_sem
);
860 lli
->lli_glimpse_time
= 0;
861 INIT_LIST_HEAD(&lli
->lli_agl_list
);
862 lli
->lli_agl_index
= 0;
863 lli
->lli_async_rc
= 0;
865 mutex_init(&lli
->lli_layout_mutex
);
868 int ll_fill_super(struct super_block
*sb
, struct vfsmount
*mnt
)
870 struct lustre_profile
*lprof
= NULL
;
871 struct lustre_sb_info
*lsi
= s2lsi(sb
);
872 struct ll_sb_info
*sbi
;
873 char *dt
= NULL
, *md
= NULL
;
874 char *profilenm
= get_profile_name(sb
);
875 struct config_llog_instance
*cfg
;
877 static atomic_t ll_bdi_num
= ATOMIC_INIT(0);
879 CDEBUG(D_VFSTRACE
, "VFS Op: sb %p\n", sb
);
881 cfg
= kzalloc(sizeof(*cfg
), GFP_NOFS
);
885 try_module_get(THIS_MODULE
);
887 /* client additional sb info */
888 sbi
= ll_init_sbi(sb
);
889 lsi
->lsi_llsbi
= sbi
;
891 module_put(THIS_MODULE
);
896 err
= ll_options(lsi
->lsi_lmd
->lmd_opts
, &sbi
->ll_flags
);
900 err
= super_setup_bdi_name(sb
, "lustre-%d",
901 atomic_inc_return(&ll_bdi_num
));
905 /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
906 sb
->s_d_op
= &ll_d_ops
;
908 /* Generate a string unique to this super, in case some joker tries
909 * to mount the same fs at two mount points.
910 * Use the address of the super itself.
912 cfg
->cfg_instance
= sb
;
913 cfg
->cfg_uuid
= lsi
->lsi_llsbi
->ll_sb_uuid
;
914 cfg
->cfg_callback
= class_config_llog_handler
;
915 /* set up client obds */
916 err
= lustre_process_log(sb
, profilenm
, cfg
);
920 /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */
921 lprof
= class_get_profile(profilenm
);
923 LCONSOLE_ERROR_MSG(0x156,
924 "The client profile '%s' could not be read from the MGS. Does that filesystem exist?\n",
929 CDEBUG(D_CONFIG
, "Found profile %s: mdc=%s osc=%s\n", profilenm
,
930 lprof
->lp_md
, lprof
->lp_dt
);
932 dt
= kasprintf(GFP_NOFS
, "%s-%p", lprof
->lp_dt
, cfg
->cfg_instance
);
938 md
= kasprintf(GFP_NOFS
, "%s-%p", lprof
->lp_md
, cfg
->cfg_instance
);
944 /* connections, registrations, sb setup */
945 err
= client_common_fill_super(sb
, md
, dt
, mnt
);
947 sbi
->ll_client_common_fill_super_succeeded
= 1;
953 class_put_profile(lprof
);
956 else if (sbi
->ll_flags
& LL_SBI_VERBOSE
)
957 LCONSOLE_WARN("Mounted %s\n", profilenm
);
961 } /* ll_fill_super */
963 void ll_put_super(struct super_block
*sb
)
965 struct config_llog_instance cfg
, params_cfg
;
966 struct obd_device
*obd
;
967 struct lustre_sb_info
*lsi
= s2lsi(sb
);
968 struct ll_sb_info
*sbi
= ll_s2sbi(sb
);
969 char *profilenm
= get_profile_name(sb
);
970 int next
, force
= 1, rc
= 0;
973 CDEBUG(D_VFSTRACE
, "VFS Op: sb %p - %s\n", sb
, profilenm
);
975 cfg
.cfg_instance
= sb
;
976 lustre_end_log(sb
, profilenm
, &cfg
);
978 params_cfg
.cfg_instance
= sb
;
979 lustre_end_log(sb
, PARAMS_FILENAME
, ¶ms_cfg
);
981 if (sbi
->ll_md_exp
) {
982 obd
= class_exp2obd(sbi
->ll_md_exp
);
984 force
= obd
->obd_force
;
987 /* Wait for unstable pages to be committed to stable storage */
989 struct l_wait_info lwi
= LWI_INTR(LWI_ON_SIGNAL_NOOP
, NULL
);
991 rc
= l_wait_event(sbi
->ll_cache
->ccc_unstable_waitq
,
992 !atomic_long_read(&sbi
->ll_cache
->ccc_unstable_nr
),
996 ccc_count
= atomic_long_read(&sbi
->ll_cache
->ccc_unstable_nr
);
997 if (!force
&& rc
!= -EINTR
)
998 LASSERTF(!ccc_count
, "count: %li\n", ccc_count
);
1000 /* We need to set force before the lov_disconnect in
1001 * lustre_common_put_super, since l_d cleans up osc's as well.
1005 while ((obd
= class_devices_in_group(&sbi
->ll_sb_uuid
,
1007 obd
->obd_force
= force
;
1011 if (sbi
->ll_client_common_fill_super_succeeded
) {
1012 /* Only if client_common_fill_super succeeded */
1013 client_common_put_super(sb
);
1017 while ((obd
= class_devices_in_group(&sbi
->ll_sb_uuid
, &next
)))
1018 class_manual_cleanup(obd
);
1020 if (sbi
->ll_flags
& LL_SBI_VERBOSE
)
1021 LCONSOLE_WARN("Unmounted %s\n", profilenm
? profilenm
: "");
1024 class_del_profile(profilenm
);
1027 lsi
->lsi_llsbi
= NULL
;
1029 lustre_common_put_super(sb
);
1031 cl_env_cache_purge(~0);
1033 module_put(THIS_MODULE
);
1034 } /* client_put_super */
1036 struct inode
*ll_inode_from_resource_lock(struct ldlm_lock
*lock
)
1038 struct inode
*inode
= NULL
;
1040 /* NOTE: we depend on atomic igrab() -bzzz */
1041 lock_res_and_lock(lock
);
1042 if (lock
->l_resource
->lr_lvb_inode
) {
1043 struct ll_inode_info
*lli
;
1045 lli
= ll_i2info(lock
->l_resource
->lr_lvb_inode
);
1046 if (lli
->lli_inode_magic
== LLI_INODE_MAGIC
) {
1047 inode
= igrab(lock
->l_resource
->lr_lvb_inode
);
1049 inode
= lock
->l_resource
->lr_lvb_inode
;
1050 LDLM_DEBUG_LIMIT(inode
->i_state
& I_FREEING
? D_INFO
:
1052 "lr_lvb_inode %p is bogus: magic %08x",
1053 lock
->l_resource
->lr_lvb_inode
,
1054 lli
->lli_inode_magic
);
1058 unlock_res_and_lock(lock
);
1062 void ll_dir_clear_lsm_md(struct inode
*inode
)
1064 struct ll_inode_info
*lli
= ll_i2info(inode
);
1066 LASSERT(S_ISDIR(inode
->i_mode
));
1068 if (lli
->lli_lsm_md
) {
1069 lmv_free_memmd(lli
->lli_lsm_md
);
1070 lli
->lli_lsm_md
= NULL
;
1074 static struct inode
*ll_iget_anon_dir(struct super_block
*sb
,
1075 const struct lu_fid
*fid
,
1076 struct lustre_md
*md
)
1078 struct ll_sb_info
*sbi
= ll_s2sbi(sb
);
1079 struct mdt_body
*body
= md
->body
;
1080 struct inode
*inode
;
1083 ino
= cl_fid_build_ino(fid
, sbi
->ll_flags
& LL_SBI_32BIT_API
);
1084 inode
= iget_locked(sb
, ino
);
1086 CERROR("%s: failed get simple inode " DFID
": rc = -ENOENT\n",
1087 ll_get_fsname(sb
, NULL
, 0), PFID(fid
));
1088 return ERR_PTR(-ENOENT
);
1091 if (inode
->i_state
& I_NEW
) {
1092 struct ll_inode_info
*lli
= ll_i2info(inode
);
1093 struct lmv_stripe_md
*lsm
= md
->lmv
;
1095 inode
->i_mode
= (inode
->i_mode
& ~S_IFMT
) |
1096 (body
->mbo_mode
& S_IFMT
);
1097 LASSERTF(S_ISDIR(inode
->i_mode
), "Not slave inode " DFID
"\n",
1100 LTIME_S(inode
->i_mtime
) = 0;
1101 LTIME_S(inode
->i_atime
) = 0;
1102 LTIME_S(inode
->i_ctime
) = 0;
1105 inode
->i_op
= &ll_dir_inode_operations
;
1106 inode
->i_fop
= &ll_dir_operations
;
1107 lli
->lli_fid
= *fid
;
1111 /* master object FID */
1112 lli
->lli_pfid
= body
->mbo_fid1
;
1113 CDEBUG(D_INODE
, "lli %p slave " DFID
" master " DFID
"\n",
1114 lli
, PFID(fid
), PFID(&lli
->lli_pfid
));
1115 unlock_new_inode(inode
);
1121 static int ll_init_lsm_md(struct inode
*inode
, struct lustre_md
*md
)
1123 struct lmv_stripe_md
*lsm
= md
->lmv
;
1129 * XXX sigh, this lsm_root initialization should be in
1130 * LMV layer, but it needs ll_iget right now, so we
1131 * put this here right now.
1133 for (i
= 0; i
< lsm
->lsm_md_stripe_count
; i
++) {
1134 fid
= &lsm
->lsm_md_oinfo
[i
].lmo_fid
;
1135 LASSERT(!lsm
->lsm_md_oinfo
[i
].lmo_root
);
1136 /* Unfortunately ll_iget will call ll_update_inode,
1137 * where the initialization of slave inode is slightly
1138 * different, so it reset lsm_md to NULL to avoid
1139 * initializing lsm for slave inode.
1141 /* For migrating inode, master stripe and master object will
1142 * be same, so we only need assign this inode
1144 if (lsm
->lsm_md_hash_type
& LMV_HASH_FLAG_MIGRATION
&& !i
)
1145 lsm
->lsm_md_oinfo
[i
].lmo_root
= inode
;
1147 lsm
->lsm_md_oinfo
[i
].lmo_root
=
1148 ll_iget_anon_dir(inode
->i_sb
, fid
, md
);
1149 if (IS_ERR(lsm
->lsm_md_oinfo
[i
].lmo_root
)) {
1150 int rc
= PTR_ERR(lsm
->lsm_md_oinfo
[i
].lmo_root
);
1152 lsm
->lsm_md_oinfo
[i
].lmo_root
= NULL
;
1160 static inline int lli_lsm_md_eq(const struct lmv_stripe_md
*lsm_md1
,
1161 const struct lmv_stripe_md
*lsm_md2
)
1163 return lsm_md1
->lsm_md_magic
== lsm_md2
->lsm_md_magic
&&
1164 lsm_md1
->lsm_md_stripe_count
== lsm_md2
->lsm_md_stripe_count
&&
1165 lsm_md1
->lsm_md_master_mdt_index
==
1166 lsm_md2
->lsm_md_master_mdt_index
&&
1167 lsm_md1
->lsm_md_hash_type
== lsm_md2
->lsm_md_hash_type
&&
1168 lsm_md1
->lsm_md_layout_version
==
1169 lsm_md2
->lsm_md_layout_version
&&
1170 !strcmp(lsm_md1
->lsm_md_pool_name
,
1171 lsm_md2
->lsm_md_pool_name
);
1174 static int ll_update_lsm_md(struct inode
*inode
, struct lustre_md
*md
)
1176 struct ll_inode_info
*lli
= ll_i2info(inode
);
1177 struct lmv_stripe_md
*lsm
= md
->lmv
;
1180 LASSERT(S_ISDIR(inode
->i_mode
));
1181 CDEBUG(D_INODE
, "update lsm %p of " DFID
"\n", lli
->lli_lsm_md
,
1182 PFID(ll_inode2fid(inode
)));
1184 /* no striped information from request. */
1186 if (!lli
->lli_lsm_md
) {
1188 } else if (lli
->lli_lsm_md
->lsm_md_hash_type
&
1189 LMV_HASH_FLAG_MIGRATION
) {
1191 * migration is done, the temporay MIGRATE layout has
1194 CDEBUG(D_INODE
, DFID
" finish migration.\n",
1195 PFID(ll_inode2fid(inode
)));
1196 lmv_free_memmd(lli
->lli_lsm_md
);
1197 lli
->lli_lsm_md
= NULL
;
1201 * The lustre_md from req does not include stripeEA,
1208 /* set the directory layout */
1209 if (!lli
->lli_lsm_md
) {
1210 struct cl_attr
*attr
;
1212 rc
= ll_init_lsm_md(inode
, md
);
1217 * set lsm_md to NULL, so the following free lustre_md
1218 * will not free this lsm
1221 lli
->lli_lsm_md
= lsm
;
1223 attr
= kzalloc(sizeof(*attr
), GFP_NOFS
);
1227 /* validate the lsm */
1228 rc
= md_merge_attr(ll_i2mdexp(inode
), lsm
, attr
,
1229 ll_md_blocking_ast
);
1235 if (md
->body
->mbo_valid
& OBD_MD_FLNLINK
)
1236 md
->body
->mbo_nlink
= attr
->cat_nlink
;
1237 if (md
->body
->mbo_valid
& OBD_MD_FLSIZE
)
1238 md
->body
->mbo_size
= attr
->cat_size
;
1239 if (md
->body
->mbo_valid
& OBD_MD_FLATIME
)
1240 md
->body
->mbo_atime
= attr
->cat_atime
;
1241 if (md
->body
->mbo_valid
& OBD_MD_FLCTIME
)
1242 md
->body
->mbo_ctime
= attr
->cat_ctime
;
1243 if (md
->body
->mbo_valid
& OBD_MD_FLMTIME
)
1244 md
->body
->mbo_mtime
= attr
->cat_mtime
;
1248 CDEBUG(D_INODE
, "Set lsm %p magic %x to " DFID
"\n", lsm
,
1249 lsm
->lsm_md_magic
, PFID(ll_inode2fid(inode
)));
1253 /* Compare the old and new stripe information */
1254 if (!lsm_md_eq(lli
->lli_lsm_md
, lsm
)) {
1255 struct lmv_stripe_md
*old_lsm
= lli
->lli_lsm_md
;
1258 CERROR("%s: inode " DFID
"(%p)'s lmv layout mismatch (%p)/(%p) magic:0x%x/0x%x stripe count: %d/%d master_mdt: %d/%d hash_type:0x%x/0x%x layout: 0x%x/0x%x pool:%s/%s\n",
1259 ll_get_fsname(inode
->i_sb
, NULL
, 0), PFID(&lli
->lli_fid
),
1260 inode
, lsm
, old_lsm
,
1261 lsm
->lsm_md_magic
, old_lsm
->lsm_md_magic
,
1262 lsm
->lsm_md_stripe_count
,
1263 old_lsm
->lsm_md_stripe_count
,
1264 lsm
->lsm_md_master_mdt_index
,
1265 old_lsm
->lsm_md_master_mdt_index
,
1266 lsm
->lsm_md_hash_type
, old_lsm
->lsm_md_hash_type
,
1267 lsm
->lsm_md_layout_version
,
1268 old_lsm
->lsm_md_layout_version
,
1269 lsm
->lsm_md_pool_name
,
1270 old_lsm
->lsm_md_pool_name
);
1272 for (idx
= 0; idx
< old_lsm
->lsm_md_stripe_count
; idx
++) {
1273 CERROR("%s: sub FIDs in old lsm idx %d, old: " DFID
"\n",
1274 ll_get_fsname(inode
->i_sb
, NULL
, 0), idx
,
1275 PFID(&old_lsm
->lsm_md_oinfo
[idx
].lmo_fid
));
1278 for (idx
= 0; idx
< lsm
->lsm_md_stripe_count
; idx
++) {
1279 CERROR("%s: sub FIDs in new lsm idx %d, new: " DFID
"\n",
1280 ll_get_fsname(inode
->i_sb
, NULL
, 0), idx
,
1281 PFID(&lsm
->lsm_md_oinfo
[idx
].lmo_fid
));
1290 void ll_clear_inode(struct inode
*inode
)
1292 struct ll_inode_info
*lli
= ll_i2info(inode
);
1293 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
1295 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p)\n",
1296 PFID(ll_inode2fid(inode
)), inode
);
1298 if (S_ISDIR(inode
->i_mode
)) {
1299 /* these should have been cleared in ll_file_release */
1300 LASSERT(!lli
->lli_opendir_key
);
1301 LASSERT(!lli
->lli_sai
);
1302 LASSERT(lli
->lli_opendir_pid
== 0);
1305 md_null_inode(sbi
->ll_md_exp
, ll_inode2fid(inode
));
1307 LASSERT(!lli
->lli_open_fd_write_count
);
1308 LASSERT(!lli
->lli_open_fd_read_count
);
1309 LASSERT(!lli
->lli_open_fd_exec_count
);
1311 if (lli
->lli_mds_write_och
)
1312 ll_md_real_close(inode
, FMODE_WRITE
);
1313 if (lli
->lli_mds_exec_och
)
1314 ll_md_real_close(inode
, FMODE_EXEC
);
1315 if (lli
->lli_mds_read_och
)
1316 ll_md_real_close(inode
, FMODE_READ
);
1318 if (S_ISLNK(inode
->i_mode
)) {
1319 kfree(lli
->lli_symlink_name
);
1320 lli
->lli_symlink_name
= NULL
;
1323 ll_xattr_cache_destroy(inode
);
1325 #ifdef CONFIG_FS_POSIX_ACL
1326 forget_all_cached_acls(inode
);
1327 if (lli
->lli_posix_acl
) {
1328 posix_acl_release(lli
->lli_posix_acl
);
1329 lli
->lli_posix_acl
= NULL
;
1332 lli
->lli_inode_magic
= LLI_INODE_DEAD
;
1334 if (S_ISDIR(inode
->i_mode
))
1335 ll_dir_clear_lsm_md(inode
);
1336 if (S_ISREG(inode
->i_mode
) && !is_bad_inode(inode
))
1337 LASSERT(list_empty(&lli
->lli_agl_list
));
1340 * XXX This has to be done before lsm is freed below, because
1341 * cl_object still uses inode lsm.
1343 cl_inode_fini(inode
);
1346 #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
1348 static int ll_md_setattr(struct dentry
*dentry
, struct md_op_data
*op_data
)
1350 struct lustre_md md
;
1351 struct inode
*inode
= d_inode(dentry
);
1352 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
1353 struct ptlrpc_request
*request
= NULL
;
1356 op_data
= ll_prep_md_op_data(op_data
, inode
, NULL
, NULL
, 0, 0,
1357 LUSTRE_OPC_ANY
, NULL
);
1358 if (IS_ERR(op_data
))
1359 return PTR_ERR(op_data
);
1361 rc
= md_setattr(sbi
->ll_md_exp
, op_data
, NULL
, 0, &request
);
1363 ptlrpc_req_finished(request
);
1364 if (rc
== -ENOENT
) {
1366 /* Unlinked special device node? Or just a race?
1367 * Pretend we did everything.
1369 if (!S_ISREG(inode
->i_mode
) &&
1370 !S_ISDIR(inode
->i_mode
)) {
1371 ia_valid
= op_data
->op_attr
.ia_valid
;
1372 op_data
->op_attr
.ia_valid
&= ~TIMES_SET_FLAGS
;
1373 rc
= simple_setattr(dentry
, &op_data
->op_attr
);
1374 op_data
->op_attr
.ia_valid
= ia_valid
;
1376 } else if (rc
!= -EPERM
&& rc
!= -EACCES
&& rc
!= -ETXTBSY
) {
1377 CERROR("md_setattr fails: rc = %d\n", rc
);
1382 rc
= md_get_lustre_md(sbi
->ll_md_exp
, request
, sbi
->ll_dt_exp
,
1383 sbi
->ll_md_exp
, &md
);
1385 ptlrpc_req_finished(request
);
1389 ia_valid
= op_data
->op_attr
.ia_valid
;
1390 /* inode size will be in cl_setattr_ost, can't do it now since dirty
1391 * cache is not cleared yet.
1393 op_data
->op_attr
.ia_valid
&= ~(TIMES_SET_FLAGS
| ATTR_SIZE
);
1394 if (S_ISREG(inode
->i_mode
))
1396 rc
= simple_setattr(dentry
, &op_data
->op_attr
);
1397 if (S_ISREG(inode
->i_mode
))
1398 inode_unlock(inode
);
1399 op_data
->op_attr
.ia_valid
= ia_valid
;
1401 rc
= ll_update_inode(inode
, &md
);
1402 ptlrpc_req_finished(request
);
1407 /* If this inode has objects allocated to it (lsm != NULL), then the OST
1408 * object(s) determine the file size and mtime. Otherwise, the MDS will
1409 * keep these values until such a time that objects are allocated for it.
1410 * We do the MDS operations first, as it is checking permissions for us.
1411 * We don't to the MDS RPC if there is nothing that we want to store there,
1412 * otherwise there is no harm in updating mtime/atime on the MDS if we are
1413 * going to do an RPC anyways.
1415 * If we are doing a truncate, we will send the mtime and ctime updates
1416 * to the OST with the punch RPC, otherwise we do an explicit setattr RPC.
1417 * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
1420 * In case of HSMimport, we only set attr on MDS.
1422 int ll_setattr_raw(struct dentry
*dentry
, struct iattr
*attr
, bool hsm_import
)
1424 struct inode
*inode
= d_inode(dentry
);
1425 struct ll_inode_info
*lli
= ll_i2info(inode
);
1426 struct md_op_data
*op_data
= NULL
;
1429 CDEBUG(D_VFSTRACE
, "%s: setattr inode " DFID
"(%p) from %llu to %llu, valid %x, hsm_import %d\n",
1430 ll_get_fsname(inode
->i_sb
, NULL
, 0), PFID(&lli
->lli_fid
), inode
,
1431 i_size_read(inode
), attr
->ia_size
, attr
->ia_valid
, hsm_import
);
1433 if (attr
->ia_valid
& ATTR_SIZE
) {
1434 /* Check new size against VFS/VM file size limit and rlimit */
1435 rc
= inode_newsize_ok(inode
, attr
->ia_size
);
1439 /* The maximum Lustre file size is variable, based on the
1440 * OST maximum object size and number of stripes. This
1441 * needs another check in addition to the VFS check above.
1443 if (attr
->ia_size
> ll_file_maxbytes(inode
)) {
1444 CDEBUG(D_INODE
, "file " DFID
" too large %llu > %llu\n",
1445 PFID(&lli
->lli_fid
), attr
->ia_size
,
1446 ll_file_maxbytes(inode
));
1450 attr
->ia_valid
|= ATTR_MTIME
| ATTR_CTIME
;
1453 /* POSIX: check before ATTR_*TIME_SET set (from setattr_prepare) */
1454 if (attr
->ia_valid
& TIMES_SET_FLAGS
) {
1455 if ((!uid_eq(current_fsuid(), inode
->i_uid
)) &&
1456 !capable(CFS_CAP_FOWNER
))
1460 /* We mark all of the fields "set" so MDS/OST does not re-set them */
1461 if (attr
->ia_valid
& ATTR_CTIME
) {
1462 attr
->ia_ctime
= current_time(inode
);
1463 attr
->ia_valid
|= ATTR_CTIME_SET
;
1465 if (!(attr
->ia_valid
& ATTR_ATIME_SET
) &&
1466 (attr
->ia_valid
& ATTR_ATIME
)) {
1467 attr
->ia_atime
= current_time(inode
);
1468 attr
->ia_valid
|= ATTR_ATIME_SET
;
1470 if (!(attr
->ia_valid
& ATTR_MTIME_SET
) &&
1471 (attr
->ia_valid
& ATTR_MTIME
)) {
1472 attr
->ia_mtime
= current_time(inode
);
1473 attr
->ia_valid
|= ATTR_MTIME_SET
;
1476 if (attr
->ia_valid
& (ATTR_MTIME
| ATTR_CTIME
))
1477 CDEBUG(D_INODE
, "setting mtime %lu, ctime %lu, now = %llu\n",
1478 LTIME_S(attr
->ia_mtime
), LTIME_S(attr
->ia_ctime
),
1479 (s64
)ktime_get_real_seconds());
1481 if (S_ISREG(inode
->i_mode
))
1482 inode_unlock(inode
);
1485 * We always do an MDS RPC, even if we're only changing the size;
1486 * only the MDS knows whether truncate() should fail with -ETXTBUSY
1488 op_data
= kzalloc(sizeof(*op_data
), GFP_NOFS
);
1494 if (!hsm_import
&& attr
->ia_valid
& ATTR_SIZE
) {
1496 * If we are changing file size, file content is
1497 * modified, flag it.
1499 attr
->ia_valid
|= MDS_OPEN_OWNEROVERRIDE
;
1500 op_data
->op_bias
|= MDS_DATA_MODIFIED
;
1501 clear_bit(LLIF_DATA_MODIFIED
, &lli
->lli_flags
);
1504 op_data
->op_attr
= *attr
;
1506 rc
= ll_md_setattr(dentry
, op_data
);
1510 if (!S_ISREG(inode
->i_mode
) || hsm_import
) {
1515 if (attr
->ia_valid
& (ATTR_SIZE
|
1516 ATTR_ATIME
| ATTR_ATIME_SET
|
1517 ATTR_MTIME
| ATTR_MTIME_SET
)) {
1518 /* For truncate and utimes sending attributes to OSTs, setting
1519 * mtime/atime to the past will be performed under PW [0:EOF]
1520 * extent lock (new_size:EOF for truncate). It may seem
1521 * excessive to send mtime/atime updates to OSTs when not
1522 * setting times to past, but it is necessary due to possible
1523 * time de-synchronization between MDT inode and OST objects
1525 rc
= cl_setattr_ost(ll_i2info(inode
)->lli_clob
, attr
, 0);
1529 * If the file was restored, it needs to set dirty flag.
1531 * We've already sent MDS_DATA_MODIFIED flag in
1532 * ll_md_setattr() for truncate. However, the MDT refuses to
1533 * set the HS_DIRTY flag on released files, so we have to set
1534 * it again if the file has been restored. Please check how
1535 * LLIF_DATA_MODIFIED is set in vvp_io_setattr_fini().
1537 * Please notice that if the file is not released, the previous
1538 * MDS_DATA_MODIFIED has taken effect and usually
1539 * LLIF_DATA_MODIFIED is not set(see vvp_io_setattr_fini()).
1540 * This way we can save an RPC for common open + trunc
1543 if (test_and_clear_bit(LLIF_DATA_MODIFIED
, &lli
->lli_flags
)) {
1544 struct hsm_state_set hss
= {
1545 .hss_valid
= HSS_SETMASK
,
1546 .hss_setmask
= HS_DIRTY
,
1550 rc2
= ll_hsm_state_set(inode
, &hss
);
1552 * truncate and write can happen at the same time, so that
1553 * the file can be set modified even though the file is not
1554 * restored from released state, and ll_hsm_state_set() is
1555 * not applicable for the file, and rc2 < 0 is normal in this
1559 CDEBUG(D_INFO
, DFID
"HSM set dirty failed: rc2 = %d\n",
1560 PFID(ll_inode2fid(inode
)), rc2
);
1565 ll_finish_md_op_data(op_data
);
1567 if (S_ISREG(inode
->i_mode
)) {
1569 if ((attr
->ia_valid
& ATTR_SIZE
) && !hsm_import
)
1570 inode_dio_wait(inode
);
1573 ll_stats_ops_tally(ll_i2sbi(inode
), (attr
->ia_valid
& ATTR_SIZE
) ?
1574 LPROC_LL_TRUNC
: LPROC_LL_SETATTR
, 1);
1579 int ll_setattr(struct dentry
*de
, struct iattr
*attr
)
1581 int mode
= d_inode(de
)->i_mode
;
1583 if ((attr
->ia_valid
& (ATTR_CTIME
| ATTR_SIZE
| ATTR_MODE
)) ==
1584 (ATTR_CTIME
| ATTR_SIZE
| ATTR_MODE
))
1585 attr
->ia_valid
|= MDS_OPEN_OWNEROVERRIDE
;
1587 if (((attr
->ia_valid
& (ATTR_MODE
| ATTR_FORCE
| ATTR_SIZE
)) ==
1588 (ATTR_SIZE
| ATTR_MODE
)) &&
1589 (((mode
& S_ISUID
) && !(attr
->ia_mode
& S_ISUID
)) ||
1590 (((mode
& (S_ISGID
| 0010)) == (S_ISGID
| 0010)) &&
1591 !(attr
->ia_mode
& S_ISGID
))))
1592 attr
->ia_valid
|= ATTR_FORCE
;
1594 if ((attr
->ia_valid
& ATTR_MODE
) &&
1596 !(attr
->ia_mode
& S_ISUID
) &&
1597 !(attr
->ia_valid
& ATTR_KILL_SUID
))
1598 attr
->ia_valid
|= ATTR_KILL_SUID
;
1600 if ((attr
->ia_valid
& ATTR_MODE
) &&
1601 ((mode
& (S_ISGID
| 0010)) == (S_ISGID
| 0010)) &&
1602 !(attr
->ia_mode
& S_ISGID
) &&
1603 !(attr
->ia_valid
& ATTR_KILL_SGID
))
1604 attr
->ia_valid
|= ATTR_KILL_SGID
;
1606 return ll_setattr_raw(de
, attr
, false);
1609 int ll_statfs_internal(struct super_block
*sb
, struct obd_statfs
*osfs
,
1610 __u64 max_age
, __u32 flags
)
1612 struct ll_sb_info
*sbi
= ll_s2sbi(sb
);
1613 struct obd_statfs obd_osfs
;
1616 rc
= obd_statfs(NULL
, sbi
->ll_md_exp
, osfs
, max_age
, flags
);
1618 CERROR("md_statfs fails: rc = %d\n", rc
);
1622 osfs
->os_type
= sb
->s_magic
;
1624 CDEBUG(D_SUPER
, "MDC blocks %llu/%llu objects %llu/%llu\n",
1625 osfs
->os_bavail
, osfs
->os_blocks
, osfs
->os_ffree
,
1628 if (sbi
->ll_flags
& LL_SBI_LAZYSTATFS
)
1629 flags
|= OBD_STATFS_NODELAY
;
1631 rc
= obd_statfs_rqset(sbi
->ll_dt_exp
, &obd_osfs
, max_age
, flags
);
1633 CERROR("obd_statfs fails: rc = %d\n", rc
);
1637 CDEBUG(D_SUPER
, "OSC blocks %llu/%llu objects %llu/%llu\n",
1638 obd_osfs
.os_bavail
, obd_osfs
.os_blocks
, obd_osfs
.os_ffree
,
1641 osfs
->os_bsize
= obd_osfs
.os_bsize
;
1642 osfs
->os_blocks
= obd_osfs
.os_blocks
;
1643 osfs
->os_bfree
= obd_osfs
.os_bfree
;
1644 osfs
->os_bavail
= obd_osfs
.os_bavail
;
1646 /* If we don't have as many objects free on the OST as inodes
1647 * on the MDS, we reduce the total number of inodes to
1648 * compensate, so that the "inodes in use" number is correct.
1650 if (obd_osfs
.os_ffree
< osfs
->os_ffree
) {
1651 osfs
->os_files
= (osfs
->os_files
- osfs
->os_ffree
) +
1653 osfs
->os_ffree
= obd_osfs
.os_ffree
;
1659 int ll_statfs(struct dentry
*de
, struct kstatfs
*sfs
)
1661 struct super_block
*sb
= de
->d_sb
;
1662 struct obd_statfs osfs
;
1665 CDEBUG(D_VFSTRACE
, "VFS Op: at %llu jiffies\n", get_jiffies_64());
1666 ll_stats_ops_tally(ll_s2sbi(sb
), LPROC_LL_STAFS
, 1);
1668 /* Some amount of caching on the client is allowed */
1669 rc
= ll_statfs_internal(sb
, &osfs
,
1670 cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS
),
1675 statfs_unpack(sfs
, &osfs
);
1677 /* We need to downshift for all 32-bit kernels, because we can't
1678 * tell if the kernel is being called via sys_statfs64() or not.
1679 * Stop before overflowing f_bsize - in which case it is better
1680 * to just risk EOVERFLOW if caller is using old sys_statfs().
1682 if (sizeof(long) < 8) {
1683 while (osfs
.os_blocks
> ~0UL && sfs
->f_bsize
< 0x40000000) {
1686 osfs
.os_blocks
>>= 1;
1687 osfs
.os_bfree
>>= 1;
1688 osfs
.os_bavail
>>= 1;
1692 sfs
->f_blocks
= osfs
.os_blocks
;
1693 sfs
->f_bfree
= osfs
.os_bfree
;
1694 sfs
->f_bavail
= osfs
.os_bavail
;
1695 sfs
->f_fsid
= ll_s2sbi(sb
)->ll_fsid
;
1699 void ll_inode_size_lock(struct inode
*inode
)
1701 struct ll_inode_info
*lli
;
1703 LASSERT(!S_ISDIR(inode
->i_mode
));
1705 lli
= ll_i2info(inode
);
1706 mutex_lock(&lli
->lli_size_mutex
);
1709 void ll_inode_size_unlock(struct inode
*inode
)
1711 struct ll_inode_info
*lli
;
1713 lli
= ll_i2info(inode
);
1714 mutex_unlock(&lli
->lli_size_mutex
);
1717 int ll_update_inode(struct inode
*inode
, struct lustre_md
*md
)
1719 struct ll_inode_info
*lli
= ll_i2info(inode
);
1720 struct mdt_body
*body
= md
->body
;
1721 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
1723 if (body
->mbo_valid
& OBD_MD_FLEASIZE
)
1724 cl_file_inode_init(inode
, md
);
1726 if (S_ISDIR(inode
->i_mode
)) {
1729 rc
= ll_update_lsm_md(inode
, md
);
1734 #ifdef CONFIG_FS_POSIX_ACL
1735 if (body
->mbo_valid
& OBD_MD_FLACL
) {
1736 spin_lock(&lli
->lli_lock
);
1737 if (lli
->lli_posix_acl
)
1738 posix_acl_release(lli
->lli_posix_acl
);
1739 lli
->lli_posix_acl
= md
->posix_acl
;
1740 spin_unlock(&lli
->lli_lock
);
1743 inode
->i_ino
= cl_fid_build_ino(&body
->mbo_fid1
,
1744 sbi
->ll_flags
& LL_SBI_32BIT_API
);
1745 inode
->i_generation
= cl_fid_build_gen(&body
->mbo_fid1
);
1747 if (body
->mbo_valid
& OBD_MD_FLATIME
) {
1748 if (body
->mbo_atime
> LTIME_S(inode
->i_atime
))
1749 LTIME_S(inode
->i_atime
) = body
->mbo_atime
;
1750 lli
->lli_atime
= body
->mbo_atime
;
1752 if (body
->mbo_valid
& OBD_MD_FLMTIME
) {
1753 if (body
->mbo_mtime
> LTIME_S(inode
->i_mtime
)) {
1755 "setting ino %lu mtime from %lu to %llu\n",
1756 inode
->i_ino
, LTIME_S(inode
->i_mtime
),
1758 LTIME_S(inode
->i_mtime
) = body
->mbo_mtime
;
1760 lli
->lli_mtime
= body
->mbo_mtime
;
1762 if (body
->mbo_valid
& OBD_MD_FLCTIME
) {
1763 if (body
->mbo_ctime
> LTIME_S(inode
->i_ctime
))
1764 LTIME_S(inode
->i_ctime
) = body
->mbo_ctime
;
1765 lli
->lli_ctime
= body
->mbo_ctime
;
1767 if (body
->mbo_valid
& OBD_MD_FLMODE
)
1768 inode
->i_mode
= (inode
->i_mode
& S_IFMT
) |
1769 (body
->mbo_mode
& ~S_IFMT
);
1770 if (body
->mbo_valid
& OBD_MD_FLTYPE
)
1771 inode
->i_mode
= (inode
->i_mode
& ~S_IFMT
) |
1772 (body
->mbo_mode
& S_IFMT
);
1773 LASSERT(inode
->i_mode
!= 0);
1774 if (S_ISREG(inode
->i_mode
))
1775 inode
->i_blkbits
= min(PTLRPC_MAX_BRW_BITS
+ 1,
1776 LL_MAX_BLKSIZE_BITS
);
1778 inode
->i_blkbits
= inode
->i_sb
->s_blocksize_bits
;
1779 if (body
->mbo_valid
& OBD_MD_FLUID
)
1780 inode
->i_uid
= make_kuid(&init_user_ns
, body
->mbo_uid
);
1781 if (body
->mbo_valid
& OBD_MD_FLGID
)
1782 inode
->i_gid
= make_kgid(&init_user_ns
, body
->mbo_gid
);
1783 if (body
->mbo_valid
& OBD_MD_FLFLAGS
)
1784 inode
->i_flags
= ll_ext_to_inode_flags(body
->mbo_flags
);
1785 if (body
->mbo_valid
& OBD_MD_FLNLINK
)
1786 set_nlink(inode
, body
->mbo_nlink
);
1787 if (body
->mbo_valid
& OBD_MD_FLRDEV
)
1788 inode
->i_rdev
= old_decode_dev(body
->mbo_rdev
);
1790 if (body
->mbo_valid
& OBD_MD_FLID
) {
1791 /* FID shouldn't be changed! */
1792 if (fid_is_sane(&lli
->lli_fid
)) {
1793 LASSERTF(lu_fid_eq(&lli
->lli_fid
, &body
->mbo_fid1
),
1794 "Trying to change FID " DFID
" to the " DFID
", inode " DFID
"(%p)\n",
1795 PFID(&lli
->lli_fid
), PFID(&body
->mbo_fid1
),
1796 PFID(ll_inode2fid(inode
)), inode
);
1798 lli
->lli_fid
= body
->mbo_fid1
;
1802 LASSERT(fid_seq(&lli
->lli_fid
) != 0);
1804 if (body
->mbo_valid
& OBD_MD_FLSIZE
) {
1805 i_size_write(inode
, body
->mbo_size
);
1807 CDEBUG(D_VFSTRACE
, "inode=" DFID
", updating i_size %llu\n",
1808 PFID(ll_inode2fid(inode
)),
1809 (unsigned long long)body
->mbo_size
);
1811 if (body
->mbo_valid
& OBD_MD_FLBLOCKS
)
1812 inode
->i_blocks
= body
->mbo_blocks
;
1815 if (body
->mbo_valid
& OBD_MD_TSTATE
) {
1816 if (body
->mbo_t_state
& MS_RESTORE
)
1817 set_bit(LLIF_FILE_RESTORING
, &lli
->lli_flags
);
1823 int ll_read_inode2(struct inode
*inode
, void *opaque
)
1825 struct lustre_md
*md
= opaque
;
1826 struct ll_inode_info
*lli
= ll_i2info(inode
);
1829 CDEBUG(D_VFSTRACE
, "VFS Op:inode=" DFID
"(%p)\n",
1830 PFID(&lli
->lli_fid
), inode
);
1832 /* Core attributes from the MDS first. This is a new inode, and
1833 * the VFS doesn't zero times in the core inode so we have to do
1834 * it ourselves. They will be overwritten by either MDS or OST
1835 * attributes - we just need to make sure they aren't newer.
1837 LTIME_S(inode
->i_mtime
) = 0;
1838 LTIME_S(inode
->i_atime
) = 0;
1839 LTIME_S(inode
->i_ctime
) = 0;
1841 rc
= ll_update_inode(inode
, md
);
1845 /* OIDEBUG(inode); */
1847 if (S_ISREG(inode
->i_mode
)) {
1848 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
1850 inode
->i_op
= &ll_file_inode_operations
;
1851 inode
->i_fop
= sbi
->ll_fop
;
1852 inode
->i_mapping
->a_ops
= (struct address_space_operations
*)&ll_aops
;
1853 } else if (S_ISDIR(inode
->i_mode
)) {
1854 inode
->i_op
= &ll_dir_inode_operations
;
1855 inode
->i_fop
= &ll_dir_operations
;
1856 } else if (S_ISLNK(inode
->i_mode
)) {
1857 inode
->i_op
= &ll_fast_symlink_inode_operations
;
1859 inode
->i_op
= &ll_special_inode_operations
;
1861 init_special_inode(inode
, inode
->i_mode
,
1868 void ll_delete_inode(struct inode
*inode
)
1870 struct ll_inode_info
*lli
= ll_i2info(inode
);
1872 if (S_ISREG(inode
->i_mode
) && lli
->lli_clob
)
1873 /* discard all dirty pages before truncating them, required by
1874 * osc_extent implementation at LU-1030.
1876 cl_sync_file_range(inode
, 0, OBD_OBJECT_EOF
,
1879 truncate_inode_pages_final(&inode
->i_data
);
1881 LASSERTF(!inode
->i_data
.nrpages
,
1882 "inode=" DFID
"(%p) nrpages=%lu, see http://jira.whamcloud.com/browse/LU-118\n",
1883 PFID(ll_inode2fid(inode
)), inode
, inode
->i_data
.nrpages
);
1885 ll_clear_inode(inode
);
1889 int ll_iocontrol(struct inode
*inode
, struct file
*file
,
1890 unsigned int cmd
, unsigned long arg
)
1892 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
1893 struct ptlrpc_request
*req
= NULL
;
1897 case FSFILT_IOC_GETFLAGS
: {
1898 struct mdt_body
*body
;
1899 struct md_op_data
*op_data
;
1901 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
,
1902 0, 0, LUSTRE_OPC_ANY
,
1904 if (IS_ERR(op_data
))
1905 return PTR_ERR(op_data
);
1907 op_data
->op_valid
= OBD_MD_FLFLAGS
;
1908 rc
= md_getattr(sbi
->ll_md_exp
, op_data
, &req
);
1909 ll_finish_md_op_data(op_data
);
1911 CERROR("%s: failure inode " DFID
": rc = %d\n",
1912 sbi
->ll_md_exp
->exp_obd
->obd_name
,
1913 PFID(ll_inode2fid(inode
)), rc
);
1917 body
= req_capsule_server_get(&req
->rq_pill
, &RMF_MDT_BODY
);
1919 flags
= body
->mbo_flags
;
1921 ptlrpc_req_finished(req
);
1923 return put_user(flags
, (int __user
*)arg
);
1925 case FSFILT_IOC_SETFLAGS
: {
1926 struct md_op_data
*op_data
;
1927 struct cl_object
*obj
;
1930 if (get_user(flags
, (int __user
*)arg
))
1933 op_data
= ll_prep_md_op_data(NULL
, inode
, NULL
, NULL
, 0, 0,
1934 LUSTRE_OPC_ANY
, NULL
);
1935 if (IS_ERR(op_data
))
1936 return PTR_ERR(op_data
);
1938 op_data
->op_attr_flags
= flags
;
1939 op_data
->op_attr
.ia_valid
|= ATTR_ATTR_FLAG
;
1940 rc
= md_setattr(sbi
->ll_md_exp
, op_data
, NULL
, 0, &req
);
1941 ll_finish_md_op_data(op_data
);
1942 ptlrpc_req_finished(req
);
1946 inode
->i_flags
= ll_ext_to_inode_flags(flags
);
1948 obj
= ll_i2info(inode
)->lli_clob
;
1952 attr
= kzalloc(sizeof(*attr
), GFP_NOFS
);
1956 attr
->ia_valid
= ATTR_ATTR_FLAG
;
1957 rc
= cl_setattr_ost(obj
, attr
, flags
);
1968 int ll_flush_ctx(struct inode
*inode
)
1970 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
1972 CDEBUG(D_SEC
, "flush context for user %d\n",
1973 from_kuid(&init_user_ns
, current_uid()));
1975 obd_set_info_async(NULL
, sbi
->ll_md_exp
,
1976 sizeof(KEY_FLUSH_CTX
), KEY_FLUSH_CTX
,
1978 obd_set_info_async(NULL
, sbi
->ll_dt_exp
,
1979 sizeof(KEY_FLUSH_CTX
), KEY_FLUSH_CTX
,
1984 /* umount -f client means force down, don't save state */
1985 void ll_umount_begin(struct super_block
*sb
)
1987 struct ll_sb_info
*sbi
= ll_s2sbi(sb
);
1988 struct obd_device
*obd
;
1989 struct obd_ioctl_data
*ioc_data
;
1990 wait_queue_head_t waitq
;
1991 struct l_wait_info lwi
;
1993 CDEBUG(D_VFSTRACE
, "VFS Op: superblock %p count %d active %d\n", sb
,
1994 sb
->s_count
, atomic_read(&sb
->s_active
));
1996 obd
= class_exp2obd(sbi
->ll_md_exp
);
1998 CERROR("Invalid MDC connection handle %#llx\n",
1999 sbi
->ll_md_exp
->exp_handle
.h_cookie
);
2004 obd
= class_exp2obd(sbi
->ll_dt_exp
);
2006 CERROR("Invalid LOV connection handle %#llx\n",
2007 sbi
->ll_dt_exp
->exp_handle
.h_cookie
);
2012 ioc_data
= kzalloc(sizeof(*ioc_data
), GFP_NOFS
);
2014 obd_iocontrol(IOC_OSC_SET_ACTIVE
, sbi
->ll_md_exp
,
2015 sizeof(*ioc_data
), ioc_data
, NULL
);
2017 obd_iocontrol(IOC_OSC_SET_ACTIVE
, sbi
->ll_dt_exp
,
2018 sizeof(*ioc_data
), ioc_data
, NULL
);
2023 /* Really, we'd like to wait until there are no requests outstanding,
2024 * and then continue. For now, we just periodically checking for vfs
2025 * to decrement mnt_cnt and hope to finish it within 10sec.
2027 init_waitqueue_head(&waitq
);
2028 lwi
= LWI_TIMEOUT_INTERVAL(cfs_time_seconds(10),
2029 cfs_time_seconds(1), NULL
, NULL
);
2030 l_wait_event(waitq
, may_umount(sbi
->ll_mnt
.mnt
), &lwi
);
2035 int ll_remount_fs(struct super_block
*sb
, int *flags
, char *data
)
2037 struct ll_sb_info
*sbi
= ll_s2sbi(sb
);
2038 char *profilenm
= get_profile_name(sb
);
2042 if ((bool)(*flags
& MS_RDONLY
) != sb_rdonly(sb
)) {
2043 read_only
= *flags
& MS_RDONLY
;
2044 err
= obd_set_info_async(NULL
, sbi
->ll_md_exp
,
2045 sizeof(KEY_READ_ONLY
),
2046 KEY_READ_ONLY
, sizeof(read_only
),
2049 LCONSOLE_WARN("Failed to remount %s %s (%d)\n",
2050 profilenm
, read_only
?
2051 "read-only" : "read-write", err
);
2056 sb
->s_flags
|= MS_RDONLY
;
2058 sb
->s_flags
&= ~MS_RDONLY
;
2060 if (sbi
->ll_flags
& LL_SBI_VERBOSE
)
2061 LCONSOLE_WARN("Remounted %s %s\n", profilenm
,
2062 read_only
? "read-only" : "read-write");
2068 * Cleanup the open handle that is cached on MDT-side.
2070 * For open case, the client side open handling thread may hit error
2071 * after the MDT grant the open. Under such case, the client should
2072 * send close RPC to the MDT as cleanup; otherwise, the open handle
2073 * on the MDT will be leaked there until the client umount or evicted.
2075 * In further, if someone unlinked the file, because the open handle
2076 * holds the reference on such file/object, then it will block the
2077 * subsequent threads that want to locate such object via FID.
2079 * \param[in] sb super block for this file-system
2080 * \param[in] open_req pointer to the original open request
2082 void ll_open_cleanup(struct super_block
*sb
, struct ptlrpc_request
*open_req
)
2084 struct mdt_body
*body
;
2085 struct md_op_data
*op_data
;
2086 struct ptlrpc_request
*close_req
= NULL
;
2087 struct obd_export
*exp
= ll_s2sbi(sb
)->ll_md_exp
;
2089 body
= req_capsule_server_get(&open_req
->rq_pill
, &RMF_MDT_BODY
);
2090 op_data
= kzalloc(sizeof(*op_data
), GFP_NOFS
);
2094 op_data
->op_fid1
= body
->mbo_fid1
;
2095 op_data
->op_handle
= body
->mbo_handle
;
2096 op_data
->op_mod_time
= get_seconds();
2097 md_close(exp
, op_data
, NULL
, &close_req
);
2098 ptlrpc_req_finished(close_req
);
2099 ll_finish_md_op_data(op_data
);
2102 int ll_prep_inode(struct inode
**inode
, struct ptlrpc_request
*req
,
2103 struct super_block
*sb
, struct lookup_intent
*it
)
2105 struct ll_sb_info
*sbi
= NULL
;
2106 struct lustre_md md
= { NULL
};
2109 LASSERT(*inode
|| sb
);
2110 sbi
= sb
? ll_s2sbi(sb
) : ll_i2sbi(*inode
);
2111 rc
= md_get_lustre_md(sbi
->ll_md_exp
, req
, sbi
->ll_dt_exp
,
2112 sbi
->ll_md_exp
, &md
);
2117 rc
= ll_update_inode(*inode
, &md
);
2124 * At this point server returns to client's same fid as client
2125 * generated for creating. So using ->fid1 is okay here.
2127 if (!fid_is_sane(&md
.body
->mbo_fid1
)) {
2128 CERROR("%s: Fid is insane " DFID
"\n",
2129 ll_get_fsname(sb
, NULL
, 0),
2130 PFID(&md
.body
->mbo_fid1
));
2135 *inode
= ll_iget(sb
, cl_fid_build_ino(&md
.body
->mbo_fid1
,
2136 sbi
->ll_flags
& LL_SBI_32BIT_API
),
2138 if (IS_ERR(*inode
)) {
2139 #ifdef CONFIG_FS_POSIX_ACL
2141 posix_acl_release(md
.posix_acl
);
2142 md
.posix_acl
= NULL
;
2146 CERROR("new_inode -fatal: rc %d\n", rc
);
2151 /* Handling piggyback layout lock.
2152 * Layout lock can be piggybacked by getattr and open request.
2153 * The lsm can be applied to inode only if it comes with a layout lock
2154 * otherwise correct layout may be overwritten, for example:
2155 * 1. proc1: mdt returns a lsm but not granting layout
2156 * 2. layout was changed by another client
2157 * 3. proc2: refresh layout and layout lock granted
2158 * 4. proc1: to apply a stale layout
2160 if (it
&& it
->it_lock_mode
!= 0) {
2161 struct lustre_handle lockh
;
2162 struct ldlm_lock
*lock
;
2164 lockh
.cookie
= it
->it_lock_handle
;
2165 lock
= ldlm_handle2lock(&lockh
);
2167 if (ldlm_has_layout(lock
)) {
2168 struct cl_object_conf conf
;
2170 memset(&conf
, 0, sizeof(conf
));
2171 conf
.coc_opc
= OBJECT_CONF_SET
;
2172 conf
.coc_inode
= *inode
;
2173 conf
.coc_lock
= lock
;
2174 conf
.u
.coc_layout
= md
.layout
;
2175 (void)ll_layout_conf(*inode
, &conf
);
2177 LDLM_LOCK_PUT(lock
);
2181 md_free_lustre_md(sbi
->ll_md_exp
, &md
);
2183 if (rc
!= 0 && it
&& it
->it_op
& IT_OPEN
)
2184 ll_open_cleanup(sb
? sb
: (*inode
)->i_sb
, req
);
2189 int ll_obd_statfs(struct inode
*inode
, void __user
*arg
)
2191 struct ll_sb_info
*sbi
= NULL
;
2192 struct obd_export
*exp
;
2194 struct obd_ioctl_data
*data
= NULL
;
2203 sbi
= ll_i2sbi(inode
);
2209 rc
= obd_ioctl_getdata(&buf
, &len
, arg
);
2214 if (!data
->ioc_inlbuf1
|| !data
->ioc_inlbuf2
||
2215 !data
->ioc_pbuf1
|| !data
->ioc_pbuf2
) {
2220 if (data
->ioc_inllen1
!= sizeof(__u32
) ||
2221 data
->ioc_inllen2
!= sizeof(__u32
) ||
2222 data
->ioc_plen1
!= sizeof(struct obd_statfs
) ||
2223 data
->ioc_plen2
!= sizeof(struct obd_uuid
)) {
2228 memcpy(&type
, data
->ioc_inlbuf1
, sizeof(__u32
));
2229 if (type
& LL_STATFS_LMV
) {
2230 exp
= sbi
->ll_md_exp
;
2231 } else if (type
& LL_STATFS_LOV
) {
2232 exp
= sbi
->ll_dt_exp
;
2238 rc
= obd_iocontrol(IOC_OBD_STATFS
, exp
, len
, buf
, NULL
);
2246 int ll_process_config(struct lustre_cfg
*lcfg
)
2250 struct lprocfs_static_vars lvars
;
2254 lprocfs_llite_init_vars(&lvars
);
2256 /* The instance name contains the sb: lustre-client-aacfe000 */
2257 ptr
= strrchr(lustre_cfg_string(lcfg
, 0), '-');
2258 if (!ptr
|| !*(++ptr
))
2260 rc
= kstrtoul(ptr
, 16, &x
);
2264 /* This better be a real Lustre superblock! */
2265 LASSERT(s2lsi((struct super_block
*)sb
)->lsi_lmd
->lmd_magic
==
2268 /* Note we have not called client_common_fill_super yet, so
2269 * proc fns must be able to handle that!
2271 rc
= class_process_proc_param(PARAM_LLITE
, lvars
.obd_vars
,
2278 /* this function prepares md_op_data hint for passing ot down to MD stack. */
2279 struct md_op_data
*ll_prep_md_op_data(struct md_op_data
*op_data
,
2280 struct inode
*i1
, struct inode
*i2
,
2281 const char *name
, size_t namelen
,
2282 u32 mode
, __u32 opc
, void *data
)
2285 /* Do not reuse namelen for something else. */
2287 return ERR_PTR(-EINVAL
);
2289 if (namelen
> ll_i2sbi(i1
)->ll_namelen
)
2290 return ERR_PTR(-ENAMETOOLONG
);
2292 if (!lu_name_is_valid_2(name
, namelen
))
2293 return ERR_PTR(-EINVAL
);
2297 op_data
= kzalloc(sizeof(*op_data
), GFP_NOFS
);
2300 return ERR_PTR(-ENOMEM
);
2302 ll_i2gids(op_data
->op_suppgids
, i1
, i2
);
2303 op_data
->op_fid1
= *ll_inode2fid(i1
);
2304 op_data
->op_default_stripe_offset
= -1;
2305 if (S_ISDIR(i1
->i_mode
)) {
2306 op_data
->op_mea1
= ll_i2info(i1
)->lli_lsm_md
;
2307 if (opc
== LUSTRE_OPC_MKDIR
)
2308 op_data
->op_default_stripe_offset
=
2309 ll_i2info(i1
)->lli_def_stripe_offset
;
2313 op_data
->op_fid2
= *ll_inode2fid(i2
);
2314 if (S_ISDIR(i2
->i_mode
))
2315 op_data
->op_mea2
= ll_i2info(i2
)->lli_lsm_md
;
2317 fid_zero(&op_data
->op_fid2
);
2320 if (ll_i2sbi(i1
)->ll_flags
& LL_SBI_64BIT_HASH
)
2321 op_data
->op_cli_flags
|= CLI_HASH64
;
2323 if (ll_need_32bit_api(ll_i2sbi(i1
)))
2324 op_data
->op_cli_flags
|= CLI_API32
;
2326 op_data
->op_name
= name
;
2327 op_data
->op_namelen
= namelen
;
2328 op_data
->op_mode
= mode
;
2329 op_data
->op_mod_time
= ktime_get_real_seconds();
2330 op_data
->op_fsuid
= from_kuid(&init_user_ns
, current_fsuid());
2331 op_data
->op_fsgid
= from_kgid(&init_user_ns
, current_fsgid());
2332 op_data
->op_cap
= cfs_curproc_cap_pack();
2333 if ((opc
== LUSTRE_OPC_CREATE
) && name
&&
2334 filename_is_volatile(name
, namelen
, &op_data
->op_mds
))
2335 op_data
->op_bias
|= MDS_CREATE_VOLATILE
;
2337 op_data
->op_mds
= 0;
2338 op_data
->op_data
= data
;
/* Release an md_op_data obtained from ll_prep_md_op_data(). */
void ll_finish_md_op_data(struct md_op_data *op_data)
{
	kfree(op_data);
}
2348 int ll_show_options(struct seq_file
*seq
, struct dentry
*dentry
)
2350 struct ll_sb_info
*sbi
;
2352 LASSERT(seq
&& dentry
);
2353 sbi
= ll_s2sbi(dentry
->d_sb
);
2355 if (sbi
->ll_flags
& LL_SBI_NOLCK
)
2356 seq_puts(seq
, ",nolock");
2358 if (sbi
->ll_flags
& LL_SBI_FLOCK
)
2359 seq_puts(seq
, ",flock");
2361 if (sbi
->ll_flags
& LL_SBI_LOCALFLOCK
)
2362 seq_puts(seq
, ",localflock");
2364 if (sbi
->ll_flags
& LL_SBI_USER_XATTR
)
2365 seq_puts(seq
, ",user_xattr");
2367 if (sbi
->ll_flags
& LL_SBI_LAZYSTATFS
)
2368 seq_puts(seq
, ",lazystatfs");
2370 if (sbi
->ll_flags
& LL_SBI_USER_FID2PATH
)
2371 seq_puts(seq
, ",user_fid2path");
2373 if (sbi
->ll_flags
& LL_SBI_ALWAYS_PING
)
2374 seq_puts(seq
, ",always_ping");
2380 * Get obd name by cmd, and copy out to user space
2382 int ll_get_obd_name(struct inode
*inode
, unsigned int cmd
, unsigned long arg
)
2384 struct ll_sb_info
*sbi
= ll_i2sbi(inode
);
2385 struct obd_device
*obd
;
2387 if (cmd
== OBD_IOC_GETDTNAME
)
2388 obd
= class_exp2obd(sbi
->ll_dt_exp
);
2389 else if (cmd
== OBD_IOC_GETMDNAME
)
2390 obd
= class_exp2obd(sbi
->ll_md_exp
);
2397 if (copy_to_user((void __user
*)arg
, obd
->obd_name
,
2398 strlen(obd
->obd_name
) + 1))
2405 * Get lustre file system name by \a sbi. If \a buf is provided(non-NULL), the
2406 * fsname will be returned in this buffer; otherwise, a static buffer will be
2407 * used to store the fsname and returned to caller.
2409 char *ll_get_fsname(struct super_block
*sb
, char *buf
, int buflen
)
2411 static char fsname_static
[MTI_NAME_MAXLEN
];
2412 struct lustre_sb_info
*lsi
= s2lsi(sb
);
2417 /* this means the caller wants to use static buffer
2418 * and it doesn't care about race. Usually this is
2419 * in error reporting path
2421 buf
= fsname_static
;
2422 buflen
= sizeof(fsname_static
);
2425 len
= strlen(lsi
->lsi_lmd
->lmd_profile
);
2426 ptr
= strrchr(lsi
->lsi_lmd
->lmd_profile
, '-');
2427 if (ptr
&& (strcmp(ptr
, "-client") == 0))
2430 if (unlikely(len
>= buflen
))
2432 strncpy(buf
, lsi
->lsi_lmd
->lmd_profile
, len
);
2438 void ll_dirty_page_discard_warn(struct page
*page
, int ioret
)
2440 char *buf
, *path
= NULL
;
2441 struct dentry
*dentry
= NULL
;
2442 struct vvp_object
*obj
= cl_inode2vvp(page
->mapping
->host
);
2444 /* this can be called inside spin lock so use GFP_ATOMIC. */
2445 buf
= (char *)__get_free_page(GFP_ATOMIC
);
2447 dentry
= d_find_alias(page
->mapping
->host
);
2449 path
= dentry_path_raw(dentry
, buf
, PAGE_SIZE
);
2453 "%s: dirty page discard: %s/fid: " DFID
"/%s may get corrupted (rc %d)\n",
2454 ll_get_fsname(page
->mapping
->host
->i_sb
, NULL
, 0),
2455 s2lsi(page
->mapping
->host
->i_sb
)->lsi_lmd
->lmd_dev
,
2456 PFID(&obj
->vob_header
.coh_lu
.loh_fid
),
2457 (path
&& !IS_ERR(path
)) ? path
: "", ioret
);
2463 free_page((unsigned long)buf
);
2466 ssize_t
ll_copy_user_md(const struct lov_user_md __user
*md
,
2467 struct lov_user_md
**kbuf
)
2469 struct lov_user_md lum
;
2472 if (copy_from_user(&lum
, md
, sizeof(lum
))) {
2477 lum_size
= ll_lov_user_md_size(&lum
);
2481 *kbuf
= kzalloc(lum_size
, GFP_NOFS
);
2487 if (copy_from_user(*kbuf
, md
, lum_size
) != 0) {
/**
 * ll_compute_rootsquash_state() - recompute root squash state of a mount
 *
 * @sbi: superblock info whose LL_SBI_NOROOTSQUASH flag is updated
 *
 * Called after a change of root squash configuration setting or an
 * add/remove of a lnet nid.  Sets LL_SBI_NOROOTSQUASH when one of this
 * node's non-loopback NIDs matches the nosquash_nids list, clears it
 * otherwise (including when the list is empty).  The update is serialized
 * by taking squash->rsi_sem for write.
 */
void ll_compute_rootsquash_state(struct ll_sb_info *sbi)
{
	struct root_squash_info *squash = &sbi->ll_squash;
	int i;
	bool matched;
	struct lnet_process_id id;

	/* Update norootsquash flag */
	down_write(&squash->rsi_sem);
	if (list_empty(&squash->rsi_nosquash_nids)) {
		sbi->ll_flags &= ~LL_SBI_NOROOTSQUASH;
	} else {
		/* Do not apply root squash as soon as one of our NIDs is
		 * in the nosquash_nids list
		 */
		matched = false;
		i = 0;

		/* walk all local NIDs until LNetGetId() runs past the end */
		while (LNetGetId(i++, &id) != -ENOENT) {
			/* skip loopback NIDs: they never identify the node */
			if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
				continue;
			if (cfs_match_nid(id.nid, &squash->rsi_nosquash_nids)) {
				matched = true;
				break;
			}
		}
		if (matched)
			sbi->ll_flags |= LL_SBI_NOROOTSQUASH;
		else
			sbi->ll_flags &= ~LL_SBI_NOROOTSQUASH;
	}
	up_write(&squash->rsi_sem);
}
/**
 * ll_linkea_decode() - parse linkea content to extract information about a
 * given hardlink
 *
 * @ldata:      Initialized linkea data
 * @linkno:     Link identifier (index of the link to decode)
 * @parent_fid: The entry's parent FID (output)
 * @ln:         Entry name destination buffer (output)
 *
 * Return: 0 on success; -ENODATA when @linkno is beyond the last link;
 * any negative error code reported by linkea_init_with_rec() on failure.
 */
static int ll_linkea_decode(struct linkea_data *ldata, unsigned int linkno,
			    struct lu_fid *parent_fid, struct lu_name *ln)
{
	unsigned int idx;
	int rc;

	rc = linkea_init_with_rec(ldata);
	if (rc < 0)
		return rc;

	if (linkno >= ldata->ld_leh->leh_reccount)
		/* beyond last link */
		return -ENODATA;

	/* walk entries in order until the linkno-th one has been unpacked */
	linkea_first_entry(ldata);
	for (idx = 0; ldata->ld_lee; idx++) {
		linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, ln,
				    parent_fid);
		if (idx == linkno)
			break;

		linkea_next_entry(ldata);
	}

	/* entries ran out before reaching linkno */
	if (idx < linkno)
		return -ENODATA;

	return 0;
}
2577 * Get parent FID and name of an identified link. Operation is performed for
2578 * a given link number, letting the caller iterate over linkno to list one or
2579 * all links of an entry.
2581 * \param[in] file - File descriptor against which to perform the operation
2582 * \param[in,out] arg - User-filled structure containing the linkno to operate
2583 * on and the available size. It is eventually filled
2584 * with the requested information or left untouched on
2587 * \retval - 0 on success
2588 * \retval - Appropriate negative error code on failure
2590 int ll_getparent(struct file
*file
, struct getparent __user
*arg
)
2592 struct inode
*inode
= file_inode(file
);
2593 struct linkea_data
*ldata
;
2594 struct lu_fid parent_fid
;
2595 struct lu_buf buf
= {
2604 if (!capable(CFS_CAP_DAC_READ_SEARCH
) &&
2605 !(ll_i2sbi(inode
)->ll_flags
& LL_SBI_USER_FID2PATH
))
2608 if (get_user(name_size
, &arg
->gp_name_size
))
2611 if (get_user(linkno
, &arg
->gp_linkno
))
2614 if (name_size
> PATH_MAX
)
2617 ldata
= kzalloc(sizeof(*ldata
), GFP_NOFS
);
2621 rc
= linkea_data_new(ldata
, &buf
);
2625 rc
= ll_xattr_list(inode
, XATTR_NAME_LINK
, XATTR_TRUSTED_T
, buf
.lb_buf
,
2626 buf
.lb_len
, OBD_MD_FLXATTR
);
2630 rc
= ll_linkea_decode(ldata
, linkno
, &parent_fid
, &ln
);
2634 if (ln
.ln_namelen
>= name_size
) {
2639 if (copy_to_user(&arg
->gp_fid
, &parent_fid
, sizeof(arg
->gp_fid
))) {
2644 if (copy_to_user(&arg
->gp_name
, ln
.ln_name
, ln
.ln_namelen
)) {
2649 if (put_user('\0', arg
->gp_name
+ ln
.ln_namelen
)) {