--- zzzz-none-000/linux-3.10.107/fs/hugetlbfs/inode.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/hugetlbfs/inode.c 2021-02-04 17:41:59.000000000 +0000 @@ -6,10 +6,13 @@ * Copyright (C) 2002 Linus Torvalds. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include #include /* remove ASAP */ +#include #include #include #include @@ -32,6 +35,7 @@ #include #include #include +#include #include @@ -45,9 +49,10 @@ kuid_t uid; kgid_t gid; umode_t mode; - long nr_blocks; + long max_hpages; long nr_inodes; struct hstate *hstate; + long min_hpages; }; struct hugetlbfs_inode_info { @@ -60,18 +65,12 @@ return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); } -static struct backing_dev_info hugetlbfs_backing_dev_info = { - .name = "hugetlbfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - int sysctl_hugetlb_shm_group; enum { Opt_size, Opt_nr_inodes, Opt_mode, Opt_uid, Opt_gid, - Opt_pagesize, + Opt_pagesize, Opt_min_size, Opt_err, }; @@ -82,9 +81,33 @@ {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_pagesize, "pagesize=%s"}, + {Opt_min_size, "min_size=%s"}, {Opt_err, NULL}, }; +#ifdef CONFIG_NUMA +static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ + vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy, + index); +} + +static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) +{ + mpol_cond_put(vma->vm_policy); +} +#else +static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ +} + +static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) +{ +} +#endif + static void huge_pagevec_release(struct pagevec *pvec) { int i; @@ -131,7 +154,6 @@ goto out; ret = 0; - hugetlb_prefault_arch_hook(vma->vm_mm); if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: @@ -169,7 +191,7 @@ addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vm_start_gap(vma))) + (!vma || addr + len <= vma->vm_start)) return addr; } @@ -183,42 +205,33 @@ } #endif -static int +static size_t hugetlbfs_read_actor(struct page *page, unsigned long offset, - char __user *buf, unsigned long count, - unsigned long size) + struct iov_iter *to, unsigned long size) { - char *kaddr; - unsigned long left, copied = 0; + size_t copied = 0; int i, chunksize; - if (size > count) - size = count; - /* Find which 4k chunk and offset with in that chunk */ i = offset >> PAGE_CACHE_SHIFT; offset = offset & ~PAGE_CACHE_MASK; while (size) { + size_t n; chunksize = PAGE_CACHE_SIZE; if (offset) chunksize -= offset; if (chunksize > size) chunksize = size; - kaddr = kmap(&page[i]); - left = __copy_to_user(buf, kaddr + offset, chunksize); - kunmap(&page[i]); - if (left) { - copied += (chunksize - left); - break; - } + n = copy_page_to_iter(&page[i], offset, chunksize, to); + copied += n; + if (n != chunksize) + return copied; offset = 0; size -= chunksize; - buf += chunksize; - copied += chunksize; i++; } - return copied ? copied : -EFAULT; + return copied; } /* @@ -226,39 +239,34 @@ * data. Its *very* similar to do_generic_mapping_read(), we can't use that * since it has PAGE_CACHE_SIZE assumptions. 
*/ -static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, - size_t len, loff_t *ppos) +static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct hstate *h = hstate_file(filp); - struct address_space *mapping = filp->f_mapping; + struct file *file = iocb->ki_filp; + struct hstate *h = hstate_file(file); + struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - unsigned long index = *ppos >> huge_page_shift(h); - unsigned long offset = *ppos & ~huge_page_mask(h); + unsigned long index = iocb->ki_pos >> huge_page_shift(h); + unsigned long offset = iocb->ki_pos & ~huge_page_mask(h); unsigned long end_index; loff_t isize; ssize_t retval = 0; - /* validate length */ - if (len == 0) - goto out; - - for (;;) { + while (iov_iter_count(to)) { struct page *page; - unsigned long nr, ret; - int ra; + size_t nr, copied; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); isize = i_size_read(inode); if (!isize) - goto out; + break; end_index = (isize - 1) >> huge_page_shift(h); - if (index >= end_index) { - if (index > end_index) - goto out; + if (index > end_index) + break; + if (index == end_index) { nr = ((isize - 1) & ~huge_page_mask(h)) + 1; if (nr <= offset) - goto out; + break; } nr = nr - offset; @@ -269,39 +277,27 @@ * We have a HOLE, zero out the user-buffer for the * length of the hole or request. */ - ret = len < nr ? len : nr; - if (clear_user(buf, ret)) - ra = -EFAULT; - else - ra = 0; + copied = iov_iter_zero(nr, to); } else { unlock_page(page); /* * We have the page, copy it to user space buffer. */ - ra = hugetlbfs_read_actor(page, offset, buf, len, nr); - ret = ra; + copied = hugetlbfs_read_actor(page, offset, to, nr); page_cache_release(page); } - if (ra < 0) { - if (retval == 0) - retval = ra; - goto out; + offset += copied; + retval += copied; + if (copied != nr && iov_iter_count(to)) { + if (!retval) + retval = -EFAULT; + break; } - - offset += ret; - retval += ret; - len -= ret; index += offset >> huge_page_shift(h); offset &= ~huge_page_mask(h); - - /* short read or no more work */ - if ((ret != nr) || (len == 0)) - break; } -out: - *ppos = ((loff_t)index << huge_page_shift(h)) + offset; + iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset; return retval; } @@ -321,62 +317,153 @@ return -EINVAL; } -static void truncate_huge_page(struct page *page) +static void remove_huge_page(struct page *page) { - cancel_dirty_page(page, /* No IO accounting for huge pages? */0); + ClearPageDirty(page); ClearPageUptodate(page); delete_from_page_cache(page); } -static void truncate_hugepages(struct inode *inode, loff_t lstart) + +/* + * remove_inode_hugepages handles two distinct cases: truncation and hole + * punch. There are subtle differences in operation for each case. + + * truncation is indicated by end of range being LLONG_MAX + * In this case, we first scan the range and release found pages. + * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv + * maps and global counts. Page faults can not race with truncation + * in this routine. hugetlb_no_page() prevents page faults in the + * truncated range. It checks i_size before allocation, and again after + * with the page table lock for the page held. The same lock must be + * acquired to unmap a page. + * hole punch is indicated if end is not LLONG_MAX + * In the hole punch case we scan the range and release found pages. + * Only when releasing a page is the associated region/reserv map + * deleted. 
The region/reserv map for ranges without associated + * pages are not modified. Page faults can race with hole punch. + * This is indicated if we find a mapped page. + * Note: If the passed end of range value is beyond the end of file, but + * not LLONG_MAX this routine still performs a hole punch operation. + */ +static void remove_inode_hugepages(struct inode *inode, loff_t lstart, + loff_t lend) { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> huge_page_shift(h); + const pgoff_t end = lend >> huge_page_shift(h); + struct vm_area_struct pseudo_vma; struct pagevec pvec; pgoff_t next; int i, freed = 0; + long lookup_nr = PAGEVEC_SIZE; + bool truncate_op = (lend == LLONG_MAX); + memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); + pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec, 0); next = start; - while (1) { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - if (next == start) - break; - next = start; - continue; - } + while (next < end) { + /* + * Don't grab more pages than the number left in the range. + */ + if (end - next < lookup_nr) + lookup_nr = end - next; + + /* + * When no more pages are found, we are done. + */ + if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) + break; for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; + u32 hash; + + /* + * The page (index) could be beyond end. This is + * only possible in the punch hole case as end is + * max page offset in the truncate case. + */ + next = page->index; + if (next >= end) + break; + + hash = hugetlb_fault_mutex_hash(h, current->mm, + &pseudo_vma, + mapping, next, 0); + mutex_lock(&hugetlb_fault_mutex_table[hash]); lock_page(page); - if (page->index > next) - next = page->index; - ++next; - truncate_huge_page(page); + if (likely(!page_mapped(page))) { + bool rsv_on_error = !PagePrivate(page); + /* + * We must free the huge page and remove + * from page cache (remove_huge_page) BEFORE + * removing the region/reserve map + * (hugetlb_unreserve_pages). In rare out + * of memory conditions, removal of the + * region/reserve map could fail. Before + * free'ing the page, note PagePrivate which + * is used in case of error. + */ + remove_huge_page(page); + freed++; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages( + inode, next, + next + 1, 1))) + hugetlb_fix_reserve_counts( + inode, rsv_on_error); + } + } else { + /* + * If page is mapped, it was faulted in after + * being unmapped. It indicates a race between + * hole punch and page fault. Do nothing in + * this case. Getting here in a truncate + * operation is a bug. 
+ */ + BUG_ON(truncate_op); + } + unlock_page(page); - freed++; + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } + ++next; huge_pagevec_release(&pvec); + cond_resched(); } - BUG_ON(!lstart && mapping->nrpages); - hugetlb_unreserve_pages(inode, start, freed); + + if (truncate_op) + (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed); } static void hugetlbfs_evict_inode(struct inode *inode) { - truncate_hugepages(inode, 0); + struct resv_map *resv_map; + + remove_inode_hugepages(inode, 0, LLONG_MAX); + resv_map = (struct resv_map *)inode->i_mapping->private_data; + /* root inode doesn't have the resv_map, so we should check it */ + if (resv_map) + resv_map_release(&resv_map->refs); clear_inode(inode); } static inline void -hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) { struct vm_area_struct *vma; - vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { + /* + * end == 0 indicates that the entire range after + * start should be unmapped. + */ + vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { unsigned long v_offset; + unsigned long v_end; /* * Can the expression below overflow on 32-bit arches? @@ -384,13 +471,22 @@ * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. */ - if (vma->vm_pgoff < pgoff) - v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; + if (vma->vm_pgoff < start) + v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; else v_offset = 0; - unmap_hugepage_range(vma, vma->vm_start + v_offset, - vma->vm_end, NULL); + if (!end) + v_end = vma->vm_end; + else { + v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + + vma->vm_start; + if (v_end > vma->vm_end) + v_end = vma->vm_end; + } + + unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, + NULL); } } @@ -404,17 +500,166 @@ pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) - hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); - mutex_unlock(&mapping->i_mmap_mutex); - truncate_hugepages(inode, offset); + hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); + i_mmap_unlock_write(mapping); + remove_inode_hugepages(inode, offset, LLONG_MAX); return 0; } +static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct hstate *h = hstate_inode(inode); + loff_t hpage_size = huge_page_size(h); + loff_t hole_start, hole_end; + + /* + * For hole punch round up the beginning offset of the hole and + * round down the end. 
+ */ + hole_start = round_up(offset, hpage_size); + hole_end = round_down(offset + len, hpage_size); + + if (hole_end > hole_start) { + struct address_space *mapping = inode->i_mapping; + + mutex_lock(&inode->i_mutex); + i_mmap_lock_write(mapping); + if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + hugetlb_vmdelete_list(&mapping->i_mmap, + hole_start >> PAGE_SHIFT, + hole_end >> PAGE_SHIFT); + i_mmap_unlock_write(mapping); + remove_inode_hugepages(inode, hole_start, hole_end); + mutex_unlock(&inode->i_mutex); + } + + return 0; +} + +static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_inode(inode); + struct vm_area_struct pseudo_vma; + struct mm_struct *mm = current->mm; + loff_t hpage_size = huge_page_size(h); + unsigned long hpage_shift = huge_page_shift(h); + pgoff_t start, index, end; + int error; + u32 hash; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return hugetlbfs_punch_hole(inode, offset, len); + + /* + * Default preallocate case. + * For this range, start is rounded down and end is rounded up + * as well as being converted to page offsets. + */ + start = offset >> hpage_shift; + end = (offset + len + hpage_size - 1) >> hpage_shift; + + mutex_lock(&inode->i_mutex); + + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ + error = inode_newsize_ok(inode, offset + len); + if (error) + goto out; + + /* + * Initialize a pseudo vma as this is required by the huge page + * allocation routines. If NUMA is configured, use page index + * as input to create an allocation policy. + */ + memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); + pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); + pseudo_vma.vm_file = file; + + for (index = start; index < end; index++) { + /* + * This is supposed to be the vaddr where the page is being + * faulted in, but we have no vaddr here. + */ + struct page *page; + unsigned long addr; + int avoid_reserve = 0; + + cond_resched(); + + /* + * fallocate(2) manpage permits EINTR; we may have been + * interrupted because we are using up too much memory. 
+ */ + if (signal_pending(current)) { + error = -EINTR; + break; + } + + /* Set numa allocation policy based on index */ + hugetlb_set_vma_policy(&pseudo_vma, inode, index); + + /* addr is the offset within the file (zero based) */ + addr = index * hpage_size; + + /* mutex taken here, fault path and hole punch */ + hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, + index, addr); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + /* See if already present in mapping to avoid alloc/free */ + page = find_get_page(mapping, index); + if (page) { + put_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + hugetlb_drop_vma_policy(&pseudo_vma); + continue; + } + + /* Allocate page and add to page cache */ + page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve); + hugetlb_drop_vma_policy(&pseudo_vma); + if (IS_ERR(page)) { + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + error = PTR_ERR(page); + goto out; + } + clear_huge_page(page, addr, pages_per_huge_page(h)); + __SetPageUptodate(page); + error = huge_add_to_page_cache(page, mapping, index); + if (unlikely(error)) { + put_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out; + } + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + + /* + * page_put due to reference from alloc_huge_page() + * unlock_page because locked by add_to_page_cache() + */ + put_page(page); + unlock_page(page); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) + i_size_write(inode, offset + len); + inode->i_ctime = CURRENT_TIME; +out: + mutex_unlock(&inode->i_mutex); + return error; +} + static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; @@ -463,21 +708,35 @@ return inode; } +/* + * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never + * be taken from reclaim -- unlike regular filesystems. This needs an + * annotation because huge_pmd_share() does an allocation under + * i_mmap_rwsem. 
+ */ +static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; + static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev) { struct inode *inode; + struct resv_map *resv_map; + + resv_map = resv_map_alloc(); + if (!resv_map) + return NULL; inode = new_inode(sb); if (inode) { struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); + lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, + &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; - inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - INIT_LIST_HEAD(&inode->i_mapping->private_list); + inode->i_mapping->private_data = resv_map; info = HUGETLBFS_I(inode); /* * The policy is initialized here even if we are creating a @@ -507,7 +766,9 @@ break; } lockdep_annotate_inode_mutex_key(inode); - } + } else + kref_put(&resv_map->refs, resv_map_release); + return inode; } @@ -592,7 +853,7 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); - struct hstate *h = hstate_inode(dentry->d_inode); + struct hstate *h = hstate_inode(d_inode(dentry)); buf->f_type = HUGETLBFS_MAGIC; buf->f_bsize = huge_page_size(h); @@ -703,11 +964,12 @@ } const struct file_operations hugetlbfs_file_operations = { - .read = hugetlbfs_read, + .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, - .llseek = default_llseek, + .llseek = default_llseek, + .fallocate = hugetlbfs_fallocate, }; static const struct inode_operations hugetlbfs_dir_inode_operations = { @@ -736,14 +998,38 @@ .show_options = generic_show_options, }; +enum { NO_SIZE, SIZE_STD, SIZE_PERCENT }; + +/* + * Convert size option passed from command line to number of huge pages + * in the pool specified by hstate. Size option could be in bytes + * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT). 
+ */ +static long long +hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, + int val_type) +{ + if (val_type == NO_SIZE) + return -1; + + if (val_type == SIZE_PERCENT) { + size_opt <<= huge_page_shift(h); + size_opt *= h->max_huge_pages; + do_div(size_opt, 100); + } + + size_opt >>= huge_page_shift(h); + return size_opt; +} + static int hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) { char *p, *rest; substring_t args[MAX_OPT_ARGS]; int option; - unsigned long long size = 0; - enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; + unsigned long long max_size_opt = 0, min_size_opt = 0; + int max_val_type = NO_SIZE, min_val_type = NO_SIZE; if (!options) return 0; @@ -781,10 +1067,10 @@ /* memparse() will accept a K/M/G without a digit */ if (!isdigit(*args[0].from)) goto bad_val; - size = memparse(args[0].from, &rest); - setsize = SIZE_STD; + max_size_opt = memparse(args[0].from, &rest); + max_val_type = SIZE_STD; if (*rest == '%') - setsize = SIZE_PERCENT; + max_val_type = SIZE_PERCENT; break; } @@ -800,38 +1086,53 @@ ps = memparse(args[0].from, &rest); pconfig->hstate = size_to_hstate(ps); if (!pconfig->hstate) { - printk(KERN_ERR - "hugetlbfs: Unsupported page size %lu MB\n", + pr_err("Unsupported page size %lu MB\n", ps >> 20); return -EINVAL; } break; } + case Opt_min_size: { + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + min_size_opt = memparse(args[0].from, &rest); + min_val_type = SIZE_STD; + if (*rest == '%') + min_val_type = SIZE_PERCENT; + break; + } + default: - printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", - p); + pr_err("Bad mount option: \"%s\"\n", p); return -EINVAL; break; } } - /* Do size after hstate is set up */ - if (setsize > NO_SIZE) { - struct hstate *h = pconfig->hstate; - if (setsize == SIZE_PERCENT) { - size <<= huge_page_shift(h); - size *= h->max_huge_pages; - do_div(size, 100); - } - pconfig->nr_blocks = (size >> huge_page_shift(h)); + /* + * Use huge page pool size (in hstate) to convert the size + * options to number of huge pages. If NO_SIZE, -1 is returned. + */ + pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, + max_size_opt, max_val_type); + pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, + min_size_opt, min_val_type); + + /* + * If max_size was specified, then min_size must be smaller + */ + if (max_val_type > NO_SIZE && + pconfig->min_hpages > pconfig->max_hpages) { + pr_err("minimum size can not be greater than maximum size\n"); + return -EINVAL; } return 0; bad_val: - printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", - args[0].from, p); + pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p); return -EINVAL; } @@ -844,12 +1145,13 @@ save_mount_options(sb, data); - config.nr_blocks = -1; /* No limit on size by default */ + config.max_hpages = -1; /* No limit on size by default */ config.nr_inodes = -1; /* No limit on number of inodes by default */ config.uid = current_fsuid(); config.gid = current_fsgid(); config.mode = 0755; config.hstate = &default_hstate; + config.min_hpages = -1; /* No default minimum size */ ret = hugetlbfs_parse_options(data, &config); if (ret) return ret; @@ -863,8 +1165,15 @@ sbinfo->max_inodes = config.nr_inodes; sbinfo->free_inodes = config.nr_inodes; sbinfo->spool = NULL; - if (config.nr_blocks != -1) { - sbinfo->spool = hugepage_new_subpool(config.nr_blocks); + /* + * Allocate and initialize subpool if maximum or minimum size is + * specified. 
Any needed reservations (for minimim size) are taken + * taken when the subpool is created. + */ + if (config.max_hpages != -1 || config.min_hpages != -1) { + sbinfo->spool = hugepage_new_subpool(config.hstate, + config.max_hpages, + config.min_hpages); if (!sbinfo->spool) goto out_free; } @@ -879,8 +1188,7 @@ goto out_free; return 0; out_free: - if (sbinfo->spool) - kfree(sbinfo->spool); + kfree(sbinfo->spool); kfree(sbinfo); return -ENOMEM; } @@ -916,7 +1224,7 @@ return h - hstates; } -static struct dentry_operations anon_ops = { +static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; @@ -947,8 +1255,7 @@ *user = current_user(); if (user_shm_lock(size, *user)) { task_lock(current); - printk_once(KERN_WARNING - "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", + pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", current->comm, current->pid); task_unlock(current); } else { @@ -971,6 +1278,8 @@ inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out_dentry; + if (creat_flags == HUGETLB_SHMFS_INODE) + inode->i_flags |= S_PRIVATE; file = ERR_PTR(-ENOMEM); if (hugetlb_reserve_pages(inode, 0, @@ -1007,9 +1316,10 @@ int error; int i; - error = bdi_init(&hugetlbfs_backing_dev_info); - if (error) - return error; + if (!hugepages_supported()) { + pr_info("disabling because there are no supported hugepage sizes\n"); + return -ENOTSUPP; + } error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", @@ -1032,7 +1342,7 @@ buf); if (IS_ERR(hugetlbfs_vfsmount[i])) { - pr_err("hugetlb: Cannot mount internal hugetlbfs for " + pr_err("Cannot mount internal hugetlbfs for " "page size %uK", ps_kb); error = PTR_ERR(hugetlbfs_vfsmount[i]); hugetlbfs_vfsmount[i] = NULL; @@ -1046,7 +1356,6 @@ out: kmem_cache_destroy(hugetlbfs_inode_cachep); out2: - bdi_destroy(&hugetlbfs_backing_dev_info); return error; } @@ -1066,7 +1375,6 @@ for_each_hstate(h) kern_unmount(hugetlbfs_vfsmount[i++]); unregister_filesystem(&hugetlbfs_fs_type); - bdi_destroy(&hugetlbfs_backing_dev_info); } module_init(init_hugetlbfs_fs)