【Linux内存源码分析】建立内核页表(3)

前面已经分析了内核页表的准备工作以及内核低端内存页表的建立,接着回到init_mem_mapping()中,低端内存页表建立后紧随着还有一个函数early_ioremap_page_table_range_init()

【file:/arch/x86/mm/init.c】
/*
 * Build a proper pagetable for the kernel mappings.  Up until this
 * point, we've been running on some set of pagetables constructed by
 * the boot process.
 *
 * If we're booting on native hardware, this will be a pagetable
 * constructed in arch/x86/kernel/head_32.S.  The root of the
 * pagetable will be swapper_pg_dir.
 *
 * If we're booting paravirtualized under a hypervisor, then there are
 * more options: we may already be running PAE, and the pagetable may
 * or may not be based in swapper_pg_dir.  In any case,
 * paravirt_pagetable_init() will set up swapper_pg_dir
 * appropriately for the rest of the initialization to work.
 *
 * In general, pagetable_init() assumes that the pagetable may already
 * be partially populated, and so it avoids stomping on any existing
 * mappings.
 */
void __init early_ioremap_page_table_range_init(void)
{
	pgd_t *pgd_base = swapper_pg_dir;
	unsigned long vaddr, end;

	/*
	 * Fixed mappings, only the page table structure has to be
	 * created - mappings will be set by set_fixmap():
	 */
	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
	page_table_range_init(vaddr, end, pgd_base);
	early_ioremap_reset();
}

该函数主要是用于建立固定内存映射区的。固定内存映射区是指FIXADDR_STARTFIXADDR_TOP的地址空间,而该地址空间因功能特性不同通过索引来定义区分,其中索引以枚举类型的形式定义在enum fixed_addresses里面。

【file:/arch/x86/include/asm/fixmap.h】
/*
 * Here we define all the compile-time 'special' virtual
 * addresses. The point is to have a constant address at
 * compile time, but to set the physical address only
 * in the boot process.
 * for x86_32: We allocate these special addresses
 * from the end of virtual memory (0xfffff000) backwards.
 * Also this lets us do fail-safe vmalloc(), we
 * can guarantee that these special addresses and
 * vmalloc()-ed addresses never overlap.
 *
 * These 'compile-time allocated' memory buffers are
 * fixed-size 4k pages (or larger if used with an increment
 * higher than 1). Use set_fixmap(idx,phys) to associate
 * physical memory with fixmap indices.
 *
 * TLB entries of such buffers will not be flushed across
 * task switches.
 */
enum fixed_addresses {
#ifdef CONFIG_X86_32
	FIX_HOLE,
	FIX_VDSO,
#else
	VSYSCALL_LAST_PAGE,
	VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
			    + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
	VVAR_PAGE,
	VSYSCALL_HPET,
#ifdef CONFIG_PARAVIRT_CLOCK
	PVCLOCK_FIXMAP_BEGIN,
	PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
#endif
#endif
	FIX_DBGP_BASE,
	FIX_EARLYCON_MEM_BASE,
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
	FIX_OHCI1394_BASE,
#endif
#ifdef CONFIG_X86_LOCAL_APIC
	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
#endif
#ifdef CONFIG_X86_IO_APIC
	FIX_IO_APIC_BASE_0,
	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif
#ifdef CONFIG_X86_VISWS_APIC
	FIX_CO_CPU,	/* Cobalt timer */
	FIX_CO_APIC,	/* Cobalt APIC Redirection Table */
	FIX_LI_PCIA,	/* Lithium PCI Bridge A */
	FIX_LI_PCIB,	/* Lithium PCI Bridge B */
#endif
	FIX_RO_IDT,	/* Virtual mapping for read-only IDT */
#ifdef CONFIG_X86_32
	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
#ifdef CONFIG_PCI_MMCONFIG
	FIX_PCIE_MCFG,
#endif
#endif
#ifdef CONFIG_PARAVIRT
	FIX_PARAVIRT_BOOTMAP,
#endif
	FIX_TEXT_POKE1,	/* reserve 2 pages for text_poke() */
	FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
#ifdef	CONFIG_X86_INTEL_MID
	FIX_LNW_VRTC,
#endif
	__end_of_permanent_fixed_addresses,

	/*
	 * 256 temporary boot-time mappings, used by early_ioremap(),
	 * before ioremap() is functional.
	 *
	 * If necessary we round it up to the next 256 pages boundary so
	 * that we can have a single pgd entry and a single pte table:
	 */
#define NR_FIX_BTMAPS		64
#define FIX_BTMAPS_SLOTS	4
#define TOTAL_FIX_BTMAPS	(NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
	FIX_BTMAP_END =
	 (__end_of_permanent_fixed_addresses ^
	  (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) &
	 -PTRS_PER_PTE
	 ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS -
	   (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
	 : __end_of_permanent_fixed_addresses,
	FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
#ifdef CONFIG_X86_32
	FIX_WP_TEST,
#endif
#ifdef CONFIG_INTEL_TXT
	FIX_TBOOT_BASE,
#endif
	__end_of_fixed_addresses
};

但是各枚举标识的分区并不是从低地址往高地址分布,而是自高地址往低地址分布。其中__fix_to_virt宏定义就是用来通过索引来计算相应的固定映射区域的线性地址。

#define __fix_to_virt(x)        (FIXADDR_TOP – ((x) <<
PAGE_SHIFT))

对应的有虚拟地址转索引的宏:

#define __virt_to_fix(x)        ((FIXADDR_TOP – ((x)&PAGE_MASK))
>> PAGE_SHIFT)

接着回到early_ioremap_page_table_range_init()的第一个函数page_table_range_init()

【file:/arch/x86/mm/init_32.c】
/*
 * This function initializes a certain range of kernel virtual memory
 * with new bootmem page tables, everywhere page tables are missing in
 * the given range.
 *
 * NOTE: The pagetables are allocated contiguous on the physical space
 * so we can cache the place of the first one and move around without
 * checking the pgd every time.
 */
static void __init
page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
{
	int pgd_idx, pmd_idx;
	unsigned long vaddr;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte = NULL;
	unsigned long count = page_table_range_init_count(start, end);
	void *adr = NULL;

	if (count)
		adr = alloc_low_pages(count);

	vaddr = start;
	pgd_idx = pgd_index(vaddr);
	pmd_idx = pmd_index(vaddr);
	pgd = pgd_base + pgd_idx;

	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
		pmd = one_md_table_init(pgd);
		pmd = pmd + pmd_index(vaddr);
		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
							pmd++, pmd_idx++) {
			pte = page_table_kmap_check(one_page_table_init(pmd),
						    pmd, vaddr, pte, &adr);

			vaddr += PMD_SIZE;
		}
		pmd_idx = 0;
	}
}

该函数里面其中调用的page_table_range_init_count()

【file:/arch/x86/mm/init_32.c】
static unsigned long __init
page_table_range_init_count(unsigned long start, unsigned long end)
{
	unsigned long count = 0;
#ifdef CONFIG_HIGHMEM
	int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
	int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
	int pgd_idx, pmd_idx;
	unsigned long vaddr;

	if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
		return 0;

	vaddr = start;
	pgd_idx = pgd_index(vaddr);

	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
							pmd_idx++) {
			if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
			    (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
				count++;
			vaddr += PMD_SIZE;
		}
		pmd_idx = 0;
	}
#endif
	return count;
}

page_table_range_init_count()用来计算指临时内核映射区间的页表数量。前面提到FIXADDR_STARTFIXADDR_TOP是固定映射区,其间有多个索引标识不同功能的映射区间,其中的一个区间FIX_KMAP_BEGINFIX_KMAP_END是临时内核映射区间。顺便可以看一下两者的定义:

    FIX_KMAP_BEGIN, /* reserved
pte’s for temporary kernel mappings */

    FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,

其中KM_TYPE_NR表示窗口数量,在高端内存的任意一个页框都可以通过一个窗口映射到内核地址空间,调用kmap_atomic可以搭建起窗口到高端内存的关系,即建立临时内核映射。而NR_CPUS则表示CPU数量。总的来说就是该临时内核映射区间是为了给各个CPU准备一个指定的窗口空间。由于kmap_atomic()对该区间的使用,所以该区间必须保证其页表连续性。

如果页全局目录数不为0的时候,紧接着page_table_range_init_count()的是alloc_low_pages()

【file:/arch/x86/mm/init.c】
/*
 * Pages returned are already directly mapped.
 *
 * Changing that is likely to break Xen, see commit:
 *
 *    279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
 *
 * for detailed information.
 */
__ref void *alloc_low_pages(unsigned int num)
{
	unsigned long pfn;
	int i;

	if (after_bootmem) {
		unsigned int order;

		order = get_order((unsigned long)num << PAGE_SHIFT);
		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
						__GFP_ZERO, order);
	}

	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
		unsigned long ret;
		if (min_pfn_mapped >= max_pfn_mapped)
			panic("alloc_low_pages: ran out of memory");
		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
					max_pfn_mapped << PAGE_SHIFT,
					PAGE_SIZE * num , PAGE_SIZE);
		if (!ret)
			panic("alloc_low_pages: can not alloc memory");
		memblock_reserve(ret, PAGE_SIZE * num);
		pfn = ret >> PAGE_SHIFT;
	} else {
		pfn = pgt_buf_end;
		pgt_buf_end += num;
		printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
			pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
	}

	for (i = 0; i < num; i++) {
		void *adr;

		adr = __va((pfn + i) << PAGE_SHIFT);
		clear_page(adr);
	}

	return __va(pfn << PAGE_SHIFT);
}

则是根据前面early_alloc_pgt_buf()申请保留的页表缓冲空间使用情况来判断,是从页表缓冲空间中申请还是通过memblock算法申请页表内存。

回到page_table_range_init(),其中one_md_table_init()是用于当pgd入参为空时,申请新物理页作为页中间目录的,但是此次仅分析x86PAE环境的情况,不存在页中间目录,故实际上返回的仍是入参。附代码:

【file:/arch/x86/mm/init_32.c】
/*
 * Creates a middle page table and puts a pointer to it in the
 * given global directory entry. This only returns the gd entry
 * in non-PAE compilation mode, since the middle layer is folded.
 */
static pmd_t * __init one_md_table_init(pgd_t *pgd)
{
	pud_t *pud;
	pmd_t *pmd_table;

#ifdef CONFIG_X86_PAE
	if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
		pmd_table = (pmd_t *)alloc_low_page();
		paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
		set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
		pud = pud_offset(pgd, 0);
		BUG_ON(pmd_table != pmd_offset(pud, 0));

		return pmd_table;
	}
#endif
	pud = pud_offset(pgd, 0);
	pmd_table = pmd_offset(pud, 0);

	return pmd_table;
}

接着的是page_table_kmap_check(),其入参调用的one_page_table_init()是用于当入参pmd没有页表指向时,创建页表并使其指向被创建的页表。page_table_kmap_check()实现:

【file:/arch/x86/mm/init_32.c】
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
					   unsigned long vaddr, pte_t *lastpte,
					   void **adr)
{
#ifdef CONFIG_HIGHMEM
	/*
	 * Something (early fixmap) may already have put a pte
	 * page here, which causes the page table allocation
	 * to become nonlinear. Attempt to fix it, and if it
	 * is still nonlinear then we have to bug.
	 */
	int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
	int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;

	if (pmd_idx_kmap_begin != pmd_idx_kmap_end
	    && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
	    && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
		pte_t *newpte;
		int i;

		BUG_ON(after_bootmem);
		newpte = *adr;
		for (i = 0; i < PTRS_PER_PTE; i++)
			set_pte(newpte + i, pte[i]);
		*adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);

		paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
		set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
		BUG_ON(newpte != pte_offset_kernel(pmd, 0));
		__flush_tlb_all();

		paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
		pte = newpte;
	}
	BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
	       && vaddr > fix_to_virt(FIX_KMAP_END)
	       && lastpte && lastpte + PTRS_PER_PTE != pte);
#endif
	return pte;
}

可以看到这里在此出现临时内核映射区间的标识(FIX_KMAP_ENDFIX_KMAP_BEGIN),检查当前页表初始化的地址是否处于该区间范围,如果是,则把其pte页表的内容拷贝到page_table_range_init()申请的页表空间中,并将newpte新页表的地址设置到pmd中(32bit系统实际上就是页全局目录),然后调用__flush_tlb_all()刷新TLB缓存;如果不是该区间,则仅是由入参中调用的one_page_table_init()被分配到了页表空间。

由此,可以知道page_table_range_init()主要是做了什么了。这是由于kmap_atomic()对该区间的使用,该区间必须保证其页表连续性。为了避免前期可能对固定映射区已经分配了页表项,基于临时内核映射区间要求页表连续性的保证,所以在此重新申请连续的页表空间将原页表内容拷贝至此。值得注意的是,与低端内存的页表初始化不同的是,这里的页表只是被分配,相应的PTE项并未初始化,这个工作将会交由以后各个固定映射区部分的相关代码调用set_fixmap()来将相关的固定映射区页表与物理内存关联。

early_ioremap_page_table_range_init()函数再往下的early_ioremap_reset()仅是对after_paging_init全局变量赋值。

最后退出early_ioremap_page_table_range_init()后,init_mem_mapping()调用load_cr3()刷新CR3寄存器,__flush_tlb_all()则用于刷新TLB,由此启用新的内存分页映射。

    至此,内核页表建立完毕。

One reply

  1. Aly Chiman说道:

    Hello there,

    My name is Aly and I would like to know if you would have any interest to have your website here at jeanleo.com promoted as a resource on our blog alychidesign.com ?

    We are in the midst of updating our broken link resources to include current and up to date resources for our readers. Our resource links are manually approved allowing us to mark a link as a do-follow link as well
    .
    If you may be interested please in being included as a resource on our blog, please let me know.

    Thanks,
    Aly

发表评论

电子邮件地址不会被公开。 必填项已用*标注