/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2015, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/uaccess.h>

#include <linux/fs.h>
#include <linux/pagemap.h>

#define DEBUG_SUBSYSTEM S_LLITE

#include "../include/lustre_lite.h"
#include "llite_internal.h"
#include "../include/linux/lustre_compat25.h"
static const struct vm_operations_struct ll_file_vm_ops;
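/*
 * Build the LDLM extent lock policy for a faulting address: the extent
 * starts at the page-aligned offset of @addr within the mapping, shifted
 * by the file offset at which the vma begins (vm_pgoff is in pages), and
 * ends at the last byte covered by @count, rounded up to a page boundary.
 * For example, with 4K pages, vm_pgoff = 4 and a fault at
 * vma->vm_start + 0x1234, the extent starts at 0x1000 + 0x4000 = 0x5000.
 */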
void policy_from_vma(ldlm_policy_data_t *policy,
		     struct vm_area_struct *vma, unsigned long addr,
		     size_t count)
{
	policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) +
				 (vma->vm_pgoff << PAGE_SHIFT);
	policy->l_extent.end = (policy->l_extent.start + count - 1) |
			       ~CFS_PAGE_MASK;
}
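/*
 * Find the first vma overlapping [addr, addr + count) that is backed by a
 * Lustre file and mapped shared; other mappings in the range are skipped.
 */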
struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
			       size_t count)
{
	struct vm_area_struct *vma, *ret = NULL;

	/* mmap_sem must have been held by caller. */
	LASSERT(!down_write_trylock(&mm->mmap_sem));

	for (vma = find_vma(mm, addr);
	     vma && vma->vm_start < (addr + count); vma = vma->vm_next) {
		if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
		    vma->vm_flags & VM_SHARED) {
			ret = vma;
			break;
		}
	}
	return ret;
}
/**
 * API-independent part of page fault initialization.
 * \param vma - virtual memory area addressed by the page fault
 * \param env_ret - corresponding lu_env for processing the fault
 * \param nest - nesting level
 * \param index - page index corresponding to the fault.
 * \param ra_flags - vma readahead flags.
 *
 * \return allocated and initialized env for the fault operation.
 * \retval EINVAL if the env can't be allocated
 * \return other error codes from cl_io_init.
 */
static struct cl_io *
ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret,
		 struct cl_env_nest *nest, pgoff_t index,
		 unsigned long *ra_flags)
{
	struct file *file = vma->vm_file;
	struct inode *inode = file_inode(file);
	struct cl_io *io;
	struct cl_fault_io *fio;
	struct lu_env *env;
	int rc;

	*env_ret = NULL;
	if (ll_file_nolock(file))
		return ERR_PTR(-EOPNOTSUPP);
	/*
	 * page fault can be called when lustre IO is
	 * already active for the current thread, e.g., when doing read/write
	 * against a user-level buffer mapped from a Lustre buffer. To avoid
	 * stomping on the existing context, optionally force an allocation
	 * of a new one.
	 */
	env = cl_env_nested_get(nest);
	if (IS_ERR(env))
		return ERR_PTR(-EINVAL);

	*env_ret = env;
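	/* Set up a CIT_FAULT io against the inode's cl_object. Note:
	 * ccc_env_thread_io() hands back the io descriptor embedded in the
	 * env's thread info, so there is no separate allocation to fail.
	 */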
	io = ccc_env_thread_io(env);
	io->ci_obj = ll_i2info(inode)->lli_clob;

	fio = &io->u.ci_fault;
	fio->ft_index = index;
	fio->ft_executable = vma->vm_flags & VM_EXEC;
	/*
	 * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
	 * the kernel will not read other pages not covered by ldlm in
	 * filemap_fault(); we do our own readahead in ll_readpage().
	 */
	if (ra_flags)
		*ra_flags = vma->vm_flags & (VM_RAND_READ | VM_SEQ_READ);
	vma->vm_flags &= ~VM_SEQ_READ;
	vma->vm_flags |= VM_RAND_READ;
	CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
	       fio->ft_index, fio->ft_executable);
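	/* On success the caller owns the io and must run cl_io_fini() and
	 * drop the nested env; on failure both are torn down here.
	 */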
	rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj);
	if (rc == 0) {
		struct ccc_io *cio = ccc_env_io(env);
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		LASSERT(cio->cui_cl.cis_io == io);

		/* the mmap lock must be MANDATORY because the io has to
		 * cache pages.
		 */
		io->ci_lockreq = CILR_MANDATORY;
		cio->cui_fd = fd;
	} else {
		cl_io_fini(env, io);
		cl_env_nested_put(nest, env);
		io = ERR_PTR(rc);
	}

	return io;
}
/* Shared code of the page_mkwrite method for rhel5 and rhel6 */
static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
			    bool *retry)
{
	struct lu_env *env;
	struct cl_io *io;
	struct vvp_io *vio;
	struct cl_env_nest nest;
	int result;
	sigset_t set;
	struct inode *inode;
	struct ll_inode_info *lli;

	io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL);
	if (IS_ERR(io)) {
		result = PTR_ERR(io);
		goto out;
	}

	result = io->ci_result;
	if (result < 0)
		goto out_io;
	io->u.ci_fault.ft_mkwrite = 1;
	io->u.ci_fault.ft_writable = 1;

	vio = vvp_env_io(env);
	vio->u.fault.ft_vma = vma;
	vio->u.fault.ft_vmpage = vmpage;
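	/* Block everything except SIGKILL and SIGTERM while the io runs
	 * (cfs_block_sigsinv() blocks all signals other than the given
	 * mask), mirroring the fault path in ll_fault().
	 */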
	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

	/* we grab lli_trunc_sem to exclude the truncate case.
	 * Otherwise, we could add dirty pages into the osc cache
	 * while truncate is on-going.
	 */
	inode = ccc_object_inode(io->ci_obj);
	lli = ll_i2info(inode);
	down_read(&lli->lli_trunc_sem);

	result = cl_io_loop(env, io);

	up_read(&lli->lli_trunc_sem);

	cfs_restore_sigs(set);
	if (result == 0) {
		struct inode *inode = file_inode(vma->vm_file);
		struct ll_inode_info *lli = ll_i2info(inode);

		lock_page(vmpage);
		if (!vmpage->mapping) {
			unlock_page(vmpage);

			/* page was truncated and lock was cancelled, return
			 * ENODATA so that VM_FAULT_NOPAGE will be returned
			 * to handle_mm_fault().
			 */
			result = -ENODATA;
		} else if (!PageDirty(vmpage)) {
			/* race, the page has been cleaned by ptlrpcd after
			 * it was unlocked, it has to be added into dirty
			 * cache again otherwise this soon-to-be-dirty page
			 * won't consume any grants, even worse if this page
			 * is being transferred because it will break the
			 * RPC checksum.
			 */
			unlock_page(vmpage);

			CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has been written out, retry.\n",
			       vmpage, vmpage->index);

			*retry = true;
			result = -EAGAIN;
		}

		if (result == 0) {
			spin_lock(&lli->lli_lock);
			lli->lli_flags |= LLIF_DATA_MODIFIED;
			spin_unlock(&lli->lli_lock);
		}
	}

out_io:
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);
out:
	CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
	LASSERT(ergo(result == 0, PageLocked(vmpage)));

	return result;
}
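/*
 * Translate a cl_io result code into the VM_FAULT_* code expected by the
 * memory-management core: 0 means a locked page is being returned.
 */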
static inline int to_fault_error(int result)
{
	switch (result) {
	case 0:
		result = VM_FAULT_LOCKED;
		break;
	case -EFAULT:
		result = VM_FAULT_NOPAGE;
		break;
	case -ENOMEM:
		result = VM_FAULT_OOM;
		break;
	default:
		result = VM_FAULT_SIGBUS;
		break;
	}
	return result;
}
/**
 * Lustre implementation of the vm_operations_struct::fault() method, called
 * by the VM to serve a page fault (both in kernel and user space).
 *
 * \param vma - virtual area struct related to the page fault
 * \param vmf - structure describing the type and address of the fault
 *
 * \return allocated and filled _locked_ page for the faulting address
 * \retval VM_FAULT_ERROR on general error
 * \retval NOPAGE_OOM if there is no memory to allocate a new page
 */
static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct lu_env *env;
	struct cl_io *io;
	struct vvp_io *vio = NULL;
	struct page *vmpage;
	unsigned long ra_flags;
	struct cl_env_nest nest;
	int result;
	int fault_ret = 0;
	io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags);
	if (IS_ERR(io))
		return to_fault_error(PTR_ERR(io));
	result = io->ci_result;
	if (result == 0) {
		vio = vvp_env_io(env);
		vio->u.fault.ft_vma = vma;
		vio->u.fault.ft_vmpage = NULL;
		vio->u.fault.fault.ft_vmf = vmf;
		vio->u.fault.fault.ft_flags = 0;
		vio->u.fault.fault.ft_flags_valid = false;

		result = cl_io_loop(env, io);
		/* ft_flags are only valid if we reached
		 * the call to filemap_fault
		 */
		if (vio->u.fault.fault.ft_flags_valid)
			fault_ret = vio->u.fault.fault.ft_flags;

		vmpage = vio->u.fault.ft_vmpage;
		if (result != 0 && vmpage) {
			put_page(vmpage);
			vmf->page = NULL;
		}
	}
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);
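	/* restore the readahead hints that ll_fault_io_init() saved and
	 * overrode with VM_RAND_READ
	 */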
	vma->vm_flags |= ra_flags;
	if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
		fault_ret |= to_fault_error(result);

	CDEBUG(D_MMAP, "%s fault %d/%d\n",
	       current->comm, fault_ret, result);
	return fault_ret;
}
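/*
 * Top-level ->fault handler: retries ll_fault0() until the page it returned
 * is still mapped once we hold the page lock, so a page truncated while the
 * fault was in flight is never handed back to the VM.
 */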
static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int count = 0;
	bool printed = false;
	int result;
	sigset_t set;

	/* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite
	 * so that it can be killed by admin but not cause segfault by
	 * other signals.
	 */
	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

restart:
	result = ll_fault0(vma, vmf);
	LASSERT(!(result & VM_FAULT_LOCKED));
	if (result == 0) {
		struct page *vmpage = vmf->page;

		/* check if this page has been truncated */
		lock_page(vmpage);
		if (unlikely(!vmpage->mapping)) { /* unlucky */
			unlock_page(vmpage);
			put_page(vmpage);
			vmf->page = NULL;

			if (!printed && ++count > 16) {
				CWARN("the page is under heavy contention, maybe your app(%s) needs revising :-)\n",
				      current->comm);
				printed = true;
			}

			goto restart;
		}

		result = VM_FAULT_LOCKED;
	}
	cfs_restore_sigs(set);
	return result;
}
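/*
 * ->page_mkwrite handler: keeps calling ll_page_mkwrite0() for as long as
 * it reports a retryable race, warning once if a page stays contended.
 */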
static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int count = 0;
	bool printed = false;
	bool retry;
	int result;

	do {
		retry = false;
		result = ll_page_mkwrite0(vma, vmf->page, &retry);

		if (!printed && ++count > 16) {
			CWARN("app(%s): the page %lu of file %lu is under heavy contention.\n",
			      current->comm, vmf->pgoff,
			      file_inode(vma->vm_file)->i_ino);
			printed = true;
		}
	} while (retry);
	switch (result) {
	case 0:
		LASSERT(PageLocked(vmf->page));
		result = VM_FAULT_LOCKED;
		break;
	case -ENODATA:
	case -EFAULT:
		result = VM_FAULT_NOPAGE;
		break;
	case -ENOMEM:
		result = VM_FAULT_OOM;
		break;
	case -EAGAIN:
		result = VM_FAULT_RETRY;
		break;
	default:
		result = VM_FAULT_SIGBUS;
		break;
	}

	return result;
}
/**
 * To avoid cancelling the locks that cover an mmapped region under lock
 * cache pressure, we track the mapped vma count in ccc_object::cob_mmap_cnt.
 */
static void ll_vm_open(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct ccc_object *vob = cl_inode2ccc(inode);

	LASSERT(vma->vm_file);
	LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
	atomic_inc(&vob->cob_mmap_cnt);
}
/**
 * Dual to ll_vm_open().
 */
static void ll_vm_close(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct ccc_object *vob = cl_inode2ccc(inode);

	LASSERT(vma->vm_file);
	atomic_dec(&vob->cob_mmap_cnt);
	LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
}
/* XXX put nice comment here. talk about __free_pte -> dirty pages and
 * nopage's reference passing to the pte
 */
int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
{
	int rc = -ENOENT;

	LASSERTF(last > first, "last %llu first %llu\n", last, first);
	if (mapping_mapped(mapping)) {
		rc = 0;
		unmap_mapping_range(mapping, first + PAGE_SIZE - 1,
				    last - first + 1, 0);
	}

	return rc;
}
static const struct vm_operations_struct ll_file_vm_ops = {
	.fault		= ll_fault,
	.page_mkwrite	= ll_page_mkwrite,
	.open		= ll_vm_open,
	.close		= ll_vm_close,
};
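/*
 * mmap(2) entry point for Lustre files: perform the generic VFS mmap, then
 * install ll_file_vm_ops and call ->open() by hand, since the VM only
 * invokes ->open() when an existing vma is duplicated, not for the initial
 * mapping.
 */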
int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(file);
	int rc;

	if (ll_file_nolock(file))
		return -EOPNOTSUPP;

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1);
	rc = generic_file_mmap(file, vma);
	if (rc == 0) {
		vma->vm_ops = &ll_file_vm_ops;
		vma->vm_ops->open(vma);
		/* update the inode's size and mtime */
		rc = ll_glimpse_size(inode);
	}

	return rc;
}