--- zzzz-none-000/linux-3.10.107/arch/x86/mm/init_64.c	2017-06-27 09:49:32.000000000 +0000
+++ scorpion-7490-727/linux-3.10.107/arch/x86/mm/init_64.c	2021-02-04 17:41:59.000000000 +0000
@@ -52,7 +52,6 @@
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
 #include <asm/init.h>
-#include <asm/uv/uv.h>
 #include <asm/setup.h>
 
 #include "mm_internal.h"
@@ -131,27 +130,13 @@
 	return 0;
 }
 
-static int __init parse_direct_gbpages_off(char *arg)
-{
-	direct_gbpages = 0;
-	return 0;
-}
-early_param("nogbpages", parse_direct_gbpages_off);
-
-static int __init parse_direct_gbpages_on(char *arg)
-{
-	direct_gbpages = 1;
-	return 0;
-}
-early_param("gbpages", parse_direct_gbpages_on);
-
 /*
  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
  * physical space so we can cache the place of the first one and move
  * around without checking the pgd every time.
  */
 
-pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
+pteval_t __supported_pte_mask __read_mostly = ~0;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 int force_personality32;
@@ -178,7 +163,7 @@
  * When memory was added/removed make sure all the processes MM have
  * suitable PGD entries in the local PGD level page.
  */
-void sync_global_pgds(unsigned long start, unsigned long end)
+void sync_global_pgds(unsigned long start, unsigned long end, int removed)
 {
 	unsigned long address;
 
@@ -186,7 +171,12 @@
 		const pgd_t *pgd_ref = pgd_offset_k(address);
 		struct page *page;
 
-		if (pgd_none(*pgd_ref))
+		/*
+		 * When it is called after memory hot remove, pgd_none()
+		 * returns true. In this case (removed == 1), we must clear
+		 * the PGD entries in the local PGD level page.
+		 */
+		if (pgd_none(*pgd_ref) && !removed)
 			continue;
 
 		spin_lock(&pgd_lock);
@@ -199,12 +189,18 @@
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 			spin_lock(pgt_lock);
 
-			if (pgd_none(*pgd))
-				set_pgd(pgd, *pgd_ref);
-			else
+			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
 				BUG_ON(pgd_page_vaddr(*pgd)
 				       != pgd_page_vaddr(*pgd_ref));
 
+			if (removed) {
+				if (pgd_none(*pgd_ref) && !pgd_none(*pgd))
+					pgd_clear(pgd);
+			} else {
+				if (pgd_none(*pgd))
+					set_pgd(pgd, *pgd_ref);
+			}
+
 			spin_unlock(pgt_lock);
 		}
 		spin_unlock(&pgd_lock);
@@ -327,12 +323,15 @@
  * Create large page table mappings for a range of physical addresses.
  */
 static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
-					pgprot_t prot)
+					enum page_cache_mode cache)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
+	pgprot_t prot;
 
+	pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
+		pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
 	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
 	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
 		pgd = pgd_offset_k((unsigned long)__va(phys));
@@ -355,12 +354,12 @@
 
 void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
 {
-	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
+	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
 }
 
 void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
 {
-	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
+	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
 }
 
 /*
@@ -368,7 +367,7 @@
  *
  *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
  *
- * phys_addr holds the negative offset to the kernel, which is added
+ * phys_base holds the negative offset to the kernel, which is added
  * to the compile time generated pmds. This results in invalid pmds up
  * to the point where we hit the physaddr 0 mapping.
 *
@@ -633,7 +632,7 @@
 	}
 
 	if (pgd_changed)
-		sync_global_pgds(addr, end - 1);
+		sync_global_pgds(addr, end - 1, 0);
 
 	__flush_tlb_all();
 
@@ -643,7 +642,7 @@
 #ifndef CONFIG_NUMA
 void __init initmem_init(void)
 {
-	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
 }
 #endif
 
@@ -688,10 +687,11 @@
  * Memory is added always to NORMAL zone. This means you will never get
  * additional DMA/DMA32 memory.
  */
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
 	struct pglist_data *pgdat = NODE_DATA(nid);
-	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
+	struct zone *zone = pgdat->node_zones +
+		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
@@ -712,36 +712,22 @@
 
 static void __meminit free_pagetable(struct page *page, int order)
 {
-	struct zone *zone;
-	bool bootmem = false;
 	unsigned long magic;
 	unsigned int nr_pages = 1 << order;
 
 	/* bootmem page has reserved flag */
 	if (PageReserved(page)) {
 		__ClearPageReserved(page);
-		bootmem = true;
 
 		magic = (unsigned long)page->lru.next;
 		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
 			while (nr_pages--)
 				put_page_bootmem(page++);
 		} else
-			__free_pages_bootmem(page, order);
+			while (nr_pages--)
+				free_reserved_page(page++);
 	} else
 		free_pages((unsigned long)page_address(page), order);
-
-	/*
-	 * SECTION_INFO pages and MIX_SECTION_INFO pages
-	 * are all allocated by bootmem.
-	 */
-	if (bootmem) {
-		zone = page_zone(page);
-		zone_span_writelock(zone);
-		zone->present_pages += nr_pages;
-		zone_span_writeunlock(zone);
-		totalram_pages += nr_pages;
-	}
 }
 
 static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
@@ -989,25 +975,26 @@
 remove_pagetable(unsigned long start, unsigned long end, bool direct)
 {
 	unsigned long next;
+	unsigned long addr;
 	pgd_t *pgd;
 	pud_t *pud;
 	bool pgd_changed = false;
 
-	for (; start < end; start = next) {
-		next = pgd_addr_end(start, end);
+	for (addr = start; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
 
-		pgd = pgd_offset_k(start);
+		pgd = pgd_offset_k(addr);
 		if (!pgd_present(*pgd))
 			continue;
 
 		pud = (pud_t *)pgd_page_vaddr(*pgd);
-		remove_pud_table(pud, start, next, direct);
+		remove_pud_table(pud, addr, next, direct);
 		if (free_pud_table(pud, pgd))
 			pgd_changed = true;
 	}
 
 	if (pgd_changed)
-		sync_global_pgds(start, end - 1);
+		sync_global_pgds(start, end - 1, 1);
 
 	flush_tlb_all();
 }
@@ -1058,9 +1045,6 @@
 
 void __init mem_init(void)
 {
-	long codesize, reservedpages, datasize, initsize;
-	unsigned long absent_pages;
-
 	pci_iommu_alloc();
 
 	/* clear_bss() already clear the empty_zero_page */
@@ -1068,29 +1052,14 @@
 	register_page_bootmem_info();
 
 	/* this will put all memory onto the freelists */
-	totalram_pages = free_all_bootmem();
-
-	absent_pages = absent_pages_in_range(0, max_pfn);
-	reservedpages = max_pfn - totalram_pages - absent_pages;
+	free_all_bootmem();
 	after_bootmem = 1;
 
-	codesize = (unsigned long) &_etext - (unsigned long) &_text;
-	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
-	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
-
 	/* Register memory areas for /proc/kcore */
-	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
-		   VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
+	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
		   PAGE_SIZE, KCORE_OTHER);
 
-	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
-	       "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
-	       nr_free_pages() << (PAGE_SHIFT-10),
-	       max_pfn << (PAGE_SHIFT-10),
-	       codesize >> 10,
-	       absent_pages << (PAGE_SHIFT-10),
-	       reservedpages << (PAGE_SHIFT-10),
-	       datasize >> 10,
-	       initsize >> 10);
+	mem_init_print_info(NULL);
 }
 
 #ifdef CONFIG_DEBUG_RODATA
@@ -1175,13 +1144,14 @@
 	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
 #endif
 
-	free_init_pages("unused kernel memory",
+	free_init_pages("unused kernel",
 			(unsigned long) __va(__pa_symbol(text_end)),
 			(unsigned long) __va(__pa_symbol(rodata_start)));
-
-	free_init_pages("unused kernel memory",
+	free_init_pages("unused kernel",
 			(unsigned long) __va(__pa_symbol(rodata_end)),
 			(unsigned long) __va(__pa_symbol(_sdata)));
+
+	debug_checkwx();
 }
 
 #endif
@@ -1222,66 +1192,40 @@
 	return pfn_valid(pte_pfn(*pte));
 }
 
-/*
- * A pseudo VMA to allow ptrace access for the vsyscall page. This only
- * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
- * not need special handling anymore:
- */
-static struct vm_area_struct gate_vma = {
-	.vm_start	= VSYSCALL_START,
-	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
-	.vm_page_prot	= PAGE_READONLY_EXEC,
-	.vm_flags	= VM_READ | VM_EXEC
-};
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
-#ifdef CONFIG_IA32_EMULATION
-	if (!mm || mm->context.ia32_compat)
-		return NULL;
-#endif
-	return &gate_vma;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
+static unsigned long probe_memory_block_size(void)
 {
-	struct vm_area_struct *vma = get_gate_vma(mm);
+	/* start from 2g */
+	unsigned long bz = 1UL<<31;
 
-	if (!vma)
-		return 0;
+	if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
+		pr_info("Using 2GB memory block size for large-memory system\n");
+		return 2UL * 1024 * 1024 * 1024;
+	}
 
-	return (addr >= vma->vm_start) && (addr < vma->vm_end);
-}
+	/* less than 64g installed */
	if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
+		return MIN_MEMORY_BLOCK_SIZE;
 
-/*
- * Use this when you have no reliable mm, typically from interrupt
- * context. It is less reliable than using a task's mm and may give
- * false positives.
- */
-int in_gate_area_no_mm(unsigned long addr)
-{
-	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
-}
+	/* get the tail size */
+	while (bz > MIN_MEMORY_BLOCK_SIZE) {
+		if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
+			break;
+		bz >>= 1;
+	}
 
-const char *arch_vma_name(struct vm_area_struct *vma)
-{
-	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
-		return "[vdso]";
-	if (vma == &gate_vma)
-		return "[vsyscall]";
-	return NULL;
+	printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
+
+	return bz;
 }
 
-#ifdef CONFIG_X86_UV
+static unsigned long memory_block_size_probed;
 unsigned long memory_block_size_bytes(void)
 {
-	if (is_uv_system()) {
-		printk(KERN_INFO "UV: memory block size 2GB\n");
-		return 2UL * 1024 * 1024 * 1024;
-	}
-	return MIN_MEMORY_BLOCK_SIZE;
+	if (!memory_block_size_probed)
+		memory_block_size_probed = probe_memory_block_size();
+
+	return memory_block_size_probed;
 }
-#endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 /*
@@ -1326,7 +1270,7 @@
 			/* check to see if we have contiguous blocks */
 			if (p_end != p || node_start != node) {
 				if (p_start)
-					printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+					pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
 						addr_start, addr_end-1, p_start, p_end-1, node_start);
 				addr_start = addr;
 				node_start = node;
@@ -1357,7 +1301,7 @@
 	else
 		err = vmemmap_populate_basepages(start, end, node);
 	if (!err)
-		sync_global_pgds(start, end - 1);
+		sync_global_pgds(start, end - 1, 0);
 	return err;
 }
 
@@ -1424,7 +1368,7 @@
 void __meminit vmemmap_populate_print_last(void)
 {
 	if (p_start) {
-		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+		pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
 			addr_start, addr_end-1, p_start, p_end-1, node_start);
 		p_start = NULL;
 		p_end = NULL;