/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/version.h>
#include <asm/uaccess.h>

#include <linux/fs.h>
#include <linux/stat.h>
#include <asm/uaccess.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

#define DEBUG_SUBSYSTEM S_LLITE

#include <lustre_lite.h>
#include "llite_internal.h"
#include <linux/lustre_compat25.h>

struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
                       int *type);

static struct vm_operations_struct ll_file_vm_ops;

void policy_from_vma(ldlm_policy_data_t *policy,
                     struct vm_area_struct *vma, unsigned long addr,
                     size_t count)
{
        policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) +
                                 (vma->vm_pgoff << PAGE_CACHE_SHIFT);
        policy->l_extent.end = (policy->l_extent.start + count - 1) |
                               ~CFS_PAGE_MASK;
}

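/*
 * Illustrative note (not part of the original source): policy_from_vma()
 * turns a faulting user address into a page-aligned file extent. A minimal
 * worked example, assuming 4 KiB pages:
 *
 *   vma->vm_start = 0x7f0000000000, vma->vm_pgoff = 16  (file offset 64 KiB)
 *   addr = 0x7f0000001234, count = 1
 *
 *   l_extent.start = ((addr - vm_start) & CFS_PAGE_MASK)
 *                    + (vm_pgoff << PAGE_CACHE_SHIFT)
 *                  = 0x1000 + 0x10000 = 0x11000
 *   l_extent.end   = (0x11000 + 1 - 1) | ~CFS_PAGE_MASK = 0x11fff
 *
 * i.e. the extent covers exactly the one file page backing the faulting
 * address.
 */
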
struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
                               size_t count)
{
        struct vm_area_struct *vma, *ret = NULL;
        ENTRY;

        /* mmap_sem must have been held by caller. */
        LASSERT(!down_write_trylock(&mm->mmap_sem));

        for (vma = find_vma(mm, addr);
             vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
                if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
                    vma->vm_flags & VM_SHARED) {
                        ret = vma;
                        break;
                }
        }
        RETURN(ret);
}

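/*
 * Illustrative sketch (not part of the original source): a typical caller
 * checks whether the user buffer it is about to touch lives in a shared
 * Lustre mapping and, if so, derives the matching lock extent:
 *
 *   down_read(&mm->mmap_sem);                  // caller must hold mmap_sem
 *   vma = our_vma(mm, addr, count);
 *   if (vma != NULL)
 *           policy_from_vma(&policy, vma, addr, count);
 *   up_read(&mm->mmap_sem);
 *
 * The LASSERT above only verifies that mmap_sem is held by someone; holding
 * it for read around the lookup is the assumed calling convention here.
 */
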
/**
 * API-independent part of page fault initialization.
 * \param vma - virtual memory area to which the page fault is addressed
 * \param env - corresponding lu_env for processing
 * \param nest - nesting level
 * \param index - page index corresponding to the fault.
 * \param ra_flags - vma readahead flags.
 *
 * \return allocated and initialized env for the fault operation.
 * \retval EINVAL if the env can't be allocated
 * \return other error codes from cl_io_init.
 */
struct cl_io *ll_fault_io_init(struct vm_area_struct *vma,
                               struct lu_env **env_ret,
                               struct cl_env_nest *nest,
                               pgoff_t index, unsigned long *ra_flags)
{
        struct file        *file  = vma->vm_file;
        struct inode       *inode = file->f_dentry->d_inode;
        struct cl_io       *io;
        struct cl_fault_io *fio;
        struct lu_env      *env;
        ENTRY;

        *env_ret = NULL;
        if (ll_file_nolock(file))
                RETURN(ERR_PTR(-EOPNOTSUPP));

        /*
         * page fault can be called when lustre IO is
         * already active for the current thread, e.g., when doing read/write
         * against user level buffer mapped from Lustre buffer. To avoid
         * stomping on existing context, optionally force an allocation of a
         * new one.
         */
        env = cl_env_nested_get(nest);
        if (IS_ERR(env))
                RETURN(ERR_PTR(-EINVAL));

        *env_ret = env;

        io = ccc_env_thread_io(env);
        io->ci_obj = ll_i2info(inode)->lli_clob;
        LASSERT(io->ci_obj != NULL);

        fio = &io->u.ci_fault;
        fio->ft_index      = index;
        fio->ft_executable = vma->vm_flags & VM_EXEC;

        /*
         * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
         * the kernel will not read other pages not covered by ldlm in
         * filemap_nopage. we do our readahead in ll_readpage.
         */
        if (ra_flags != NULL)
                *ra_flags = vma->vm_flags & (VM_RAND_READ | VM_SEQ_READ);
        vma->vm_flags &= ~VM_SEQ_READ;
        vma->vm_flags |= VM_RAND_READ;

        CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
               fio->ft_index, fio->ft_executable);

        if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) {
                struct ccc_io *cio = ccc_env_io(env);
                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

                LASSERT(cio->cui_cl.cis_io == io);

                /* mmap lock must be MANDATORY:
                 * it has to cache pages. */
                io->ci_lockreq = CILR_MANDATORY;

                cio->cui_fd = fd;
        }

        return io;
}

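/*
 * Illustrative sketch (not part of the original source): both fault paths
 * below are built around this helper in the same way --
 *
 *   io = ll_fault_io_init(vma, &env, &nest, index, &ra_flags);
 *   if (IS_ERR(io))
 *           return ...;                  // nolock mount or env failure
 *   ...fill in the vvp_io fault parameters...
 *   result = cl_io_loop(env, io);        // run the CIT_FAULT io
 *   cl_env_nested_put(&nest, env);       // drop the (possibly nested) env
 *
 * ll_fault0() passes &ra_flags so it can restore the readahead bits cleared
 * here once the fault completes; ll_page_mkwrite0() passes NULL and simply
 * leaves the vma with VM_RAND_READ set.
 */
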
/* Sharing code of page_mkwrite method for rhel5 and rhel6 */
static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
                            bool *retry)
{
        struct lu_env        *env;
        struct cl_io         *io;
        struct vvp_io        *vio;
        struct cl_env_nest    nest;
        int                   result;
        cfs_sigset_t          set;
        struct inode         *inode;
        struct ll_inode_info *lli;
        ENTRY;

        LASSERT(vmpage != NULL);

        io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL);
        if (IS_ERR(io))
                GOTO(out, result = PTR_ERR(io));

        result = io->ci_result;
        if (result < 0)
                GOTO(out, result);

        io->u.ci_fault.ft_mkwrite  = 1;
        io->u.ci_fault.ft_writable = 1;

        vio = vvp_env_io(env);
        vio->u.fault.ft_vma    = vma;
        vio->u.fault.ft_vmpage = vmpage;

        set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

        /* we grab lli_trunc_sem to exclude truncate case.
         * Otherwise, we could add dirty pages into osc cache
         * while truncate is on-going. */
        inode = ccc_object_inode(io->ci_obj);
        lli = ll_i2info(inode);
        down_read(&lli->lli_trunc_sem);

        result = cl_io_loop(env, io);

        up_read(&lli->lli_trunc_sem);

        cfs_restore_sigs(set);

        if (result == 0) {
                struct inode *inode = vma->vm_file->f_dentry->d_inode;
                struct ll_inode_info *lli = ll_i2info(inode);

                lock_page(vmpage);
                if (vmpage->mapping == NULL) {
                        unlock_page(vmpage);

                        /* page was truncated and lock was cancelled, return
                         * ENODATA so that VM_FAULT_NOPAGE will be returned
                         * to handle_mm_fault(). */
                        result = -ENODATA;
                } else if (!PageDirty(vmpage)) {
                        /* race, the page has been cleaned by ptlrpcd after
                         * it was unlocked, it has to be added into dirty
                         * cache again otherwise this soon-to-dirty page won't
                         * consume any grants, even worse if this page is being
                         * transferred because it will break RPC checksum. */
                        unlock_page(vmpage);

                        CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has "
                               "been written out, retry.\n",
                               vmpage, vmpage->index);

                        *retry = true;
                        result = -EAGAIN;
                }

                if (result == 0) {
                        spin_lock(&lli->lli_lock);
                        lli->lli_flags |= LLIF_DATA_MODIFIED;
                        spin_unlock(&lli->lli_lock);
                }
        }

out:
        cl_env_nested_put(&nest, env);

        CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
        LASSERT(ergo(result == 0, PageLocked(vmpage)));

        return result;
}

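/*
 * Illustrative note (not part of the original source): ->page_mkwrite() is
 * invoked by the kernel on the first write to a clean page of a shared
 * writable mapping, e.g. from user space:
 *
 *   p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *   p[0] = 'x';    // write fault on a clean page -> ll_page_mkwrite()
 *
 * which gives Lustre a chance to take the extent lock and account the page
 * in the dirty cache before it is allowed to become writable.
 */
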
static inline int to_fault_error(int result)
{
        switch (result) {
        case 0:
                result = VM_FAULT_LOCKED;
                break;
        case -EFAULT:
                result = VM_FAULT_NOPAGE;
                break;
        case -ENOMEM:
                result = VM_FAULT_OOM;
                break;
        default:
                result = VM_FAULT_SIGBUS;
                break;
        }
        return result;
}

/**
 * Lustre implementation of a vm_operations_struct::fault() method, called by
 * the VM to serve a page fault (both in kernel and user space).
 *
 * \param vma - virtual area struct related to the page fault
 * \param vmf - structure describing the fault type and the address where it hit
 *
 * \return allocated and filled _locked_ page for the address
 * \retval VM_FAULT_ERROR on general error
 * \retval NOPAGE_OOM if there is no memory to allocate a new page
 */
static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct lu_env      *env;
        struct cl_io       *io;
        struct vvp_io      *vio = NULL;
        struct page        *vmpage;
        unsigned long       ra_flags;
        struct cl_env_nest  nest;
        int                 result;
        int                 fault_ret = 0;
        ENTRY;

        io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags);
        if (IS_ERR(io))
                RETURN(to_fault_error(PTR_ERR(io)));

        result = io->ci_result;
        if (result == 0) {
                vio = vvp_env_io(env);
                vio->u.fault.ft_vma       = vma;
                vio->u.fault.ft_vmpage    = NULL;
                vio->u.fault.fault.ft_vmf = vmf;

                result = cl_io_loop(env, io);

                fault_ret = vio->u.fault.fault.ft_flags;
                vmpage = vio->u.fault.ft_vmpage;
                if (result != 0 && vmpage != NULL) {
                        page_cache_release(vmpage);
                        vmf->page = NULL;
                }
        }
        cl_io_fini(env, io);
        cl_env_nested_put(&nest, env);

        vma->vm_flags |= ra_flags;
        if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
                fault_ret |= to_fault_error(result);

        CDEBUG(D_MMAP, "%s fault %d/%d\n",
               current->comm, fault_ret, result);
        RETURN(fault_ret);
}

static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        int          count = 0;
        bool         printed = false;
        int          result;
        cfs_sigset_t set;

        /* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite
         * so that the process can be killed by the admin but will not
         * segfault on other signals. */
        set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

restart:
        result = ll_fault0(vma, vmf);
        LASSERT(!(result & VM_FAULT_LOCKED));
        if (result == 0) {
                struct page *vmpage = vmf->page;

                /* check if this page has been truncated */
                lock_page(vmpage);
                if (unlikely(vmpage->mapping == NULL)) { /* unlucky */
                        unlock_page(vmpage);
                        page_cache_release(vmpage);
                        vmf->page = NULL;

                        if (!printed && ++count > 16) {
                                CWARN("the page is under heavy contention, "
                                      "maybe your app(%s) needs revising :-)\n",
                                      current->comm);
                                printed = true;
                        }

                        goto restart;
                }

                result |= VM_FAULT_LOCKED;
        }
        cfs_restore_sigs(set);
        return result;
}

static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        int  count = 0;
        bool printed = false;
        bool retry;
        int  result;

        do {
                retry = false;
                result = ll_page_mkwrite0(vma, vmf->page, &retry);

                if (!printed && ++count > 16) {
                        CWARN("app(%s): the page %lu of file %lu is under heavy"
                              " contention\n",
                              current->comm, vmf->pgoff,
                              vma->vm_file->f_dentry->d_inode->i_ino);
                        printed = true;
                }
        } while (retry);

        switch (result) {
        case 0:
                LASSERT(PageLocked(vmf->page));
                result = VM_FAULT_LOCKED;
                break;
        case -ENODATA:
        case -EFAULT:
                result = VM_FAULT_NOPAGE;
                break;
        case -ENOMEM:
                result = VM_FAULT_OOM;
                break;
        case -EAGAIN:
                result = VM_FAULT_RETRY;
                break;
        default:
                result = VM_FAULT_SIGBUS;
                break;
        }

        return result;
}

/**
 * To avoid cancelling the locks covering an mmapped region under lock cache
 * pressure, we track the mapped vma count in ccc_object::cob_mmap_cnt.
 */
static void ll_vm_open(struct vm_area_struct *vma)
{
        struct inode *inode = vma->vm_file->f_dentry->d_inode;
        struct ccc_object *vob = cl_inode2ccc(inode);

        LASSERT(vma->vm_file);
        LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
        atomic_inc(&vob->cob_mmap_cnt);
}

/**
 * Dual to ll_vm_open().
 */
static void ll_vm_close(struct vm_area_struct *vma)
{
        struct inode      *inode = vma->vm_file->f_dentry->d_inode;
        struct ccc_object *vob   = cl_inode2ccc(inode);

        LASSERT(vma->vm_file);
        atomic_dec(&vob->cob_mmap_cnt);
        LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
}

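/*
 * Illustrative note (not part of the original source): ->open() runs whenever
 * a vma over this file is created or duplicated (ll_file_mmap() below, fork(),
 * vma splits) and ->close() whenever one is torn down (munmap(), process
 * exit), so cob_mmap_cnt is simply the number of live vmas on the object and
 * can be consulted to avoid cancelling locks that still back a mapping.
 */
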
/* return the user space pointer that maps to a file offset via a vma */
static inline unsigned long file_to_user(struct vm_area_struct *vma, __u64 byte)
{
        return vma->vm_start +
               (byte - ((__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT));
}

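/*
 * Illustrative note (not part of the original source): file_to_user() is the
 * inverse of the offset arithmetic in policy_from_vma(). Reusing the numbers
 * assumed there (vm_start = 0x7f0000000000, vm_pgoff = 16, 4 KiB pages), file
 * offset 0x11000 maps back to
 *
 *   0x7f0000000000 + (0x11000 - (16 << PAGE_CACHE_SHIFT)) = 0x7f0000001000
 *
 * i.e. the start of the user page containing the faulting address from the
 * earlier example.
 */
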
/* XXX put nice comment here. talk about __free_pte -> dirty pages and
 * nopage's reference passing to the pte */
int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
{
        int rc = -ENOENT;
        ENTRY;

        LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first);
        if (mapping_mapped(mapping)) {
                rc = 0;
                unmap_mapping_range(mapping, first + PAGE_CACHE_SIZE - 1,
                                    last - first + 1, 0);
        }

        RETURN(rc);
}

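/*
 * Illustrative sketch (not part of the original source): a lock cancellation
 * path that must invalidate the byte range [first, last] of an inode would
 * use this roughly as
 *
 *   if (ll_teardown_mmaps(inode->i_mapping, first, last) == 0)
 *           ...;  // user ptes were zapped; later accesses refault under a new lock
 *
 * so cached pages can be dropped without leaving stale user mappings behind.
 */
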
static struct vm_operations_struct ll_file_vm_ops = {
        .fault          = ll_fault,
        .page_mkwrite   = ll_page_mkwrite,
        .open           = ll_vm_open,
        .close          = ll_vm_close,
};

int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file->f_dentry->d_inode;
        int rc;
        ENTRY;

        if (ll_file_nolock(file))
                RETURN(-EOPNOTSUPP);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1);
        rc = generic_file_mmap(file, vma);
        if (rc == 0) {
                vma->vm_ops = &ll_file_vm_ops;
                vma->vm_ops->open(vma);
                /* update the inode's size and mtime */
                rc = ll_glimpse_size(inode);
        }

        RETURN(rc);
}

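/*
 * Illustrative sketch (not part of the original source): from user space the
 * machinery above is driven by an ordinary mmap() of a Lustre file, and each
 * step lands in one of the ll_file_vm_ops handlers:
 *
 *   fd = open("/mnt/lustre/file", O_RDWR);
 *   p  = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *                                    // ll_file_mmap() -> ll_vm_open()
 *   x  = p[0];                       // read fault     -> ll_fault()
 *   p[0] = x + 1;                    // first write    -> ll_page_mkwrite()
 *   munmap(p, len);                  // teardown       -> ll_vm_close()
 *
 * "/mnt/lustre/file" is just an assumed mount point for the example.
 */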