From: Tejun Heo Date: Wed, 16 Feb 2011 16:11:10 +0000 (+0100) Subject: x86-64, NUMA: Make emulation code build numa_meminfo and share the registration path X-Git-Tag: v2.6.39-rc1~499^2~24 X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=c88aea7a70b0f014f98c695069ba91abc9e9b9a4;p=~emulex%2Finfiniband.git x86-64, NUMA: Make emulation code build numa_meminfo and share the registration path NUMA emulation code built nodes[] array and had its own registration path to set up the emulated nodes. Update it such that it generates emulated numa_meminfo and returns control to initmem_init() and shares the same registration path with non-emulated cases. Because {acpi|amd}_fake_nodes() expect nodes[] parameter, fake_physnodes() now generates nodes[] from numa_meminfo. This will go away with further updates. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: Ingo Molnar Cc: H. Peter Anvin --- diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index e9919c4d157..9736204337b 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -541,7 +541,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ -static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode physnodes[MAX_NUMNODES] __initdata; static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; @@ -626,9 +625,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end) return ret; } -static void __init fake_physnodes(int acpi, int amd, int nr_nodes) +static void __init fake_physnodes(int acpi, int amd, + const struct numa_meminfo *ei) { - int i; + static struct bootnode nodes[MAX_NUMNODES] __initdata; + int i, nr_nodes = 0; + + for (i = 0; i < ei->nr_blks; i++) { + int nid = ei->blk[i].nid; + + if (nodes[nid].start == nodes[nid].end) { + nodes[nid].start = ei->blk[i].start; + nodes[nid].end = ei->blk[i].end; + nr_nodes++; + } else { + nodes[nid].start = min(ei->blk[i].start, nodes[nid].start); + nodes[nid].end = max(ei->blk[i].end, nodes[nid].end); + } + } BUG_ON(acpi && amd); #ifdef CONFIG_ACPI_NUMA @@ -645,45 +659,44 @@ static void __init fake_physnodes(int acpi, int amd, int nr_nodes) } /* - * Setups up nid to range from addr to addr + size. If the end - * boundary is greater than max_addr, then max_addr is used instead. - * The return value is 0 if there is additional memory left for - * allocation past addr and -1 otherwise. addr is adjusted to be at - * the end of the node. + * Sets up nid to range from @start to @end. The return value is -errno if + * something went wrong, 0 otherwise. */ -static int __init setup_node_range(int nid, int physnid, - u64 *addr, u64 size, u64 max_addr) +static int __init emu_setup_memblk(struct numa_meminfo *ei, + int nid, int physnid, u64 start, u64 end) { - int ret = 0; - nodes[nid].start = *addr; - *addr += size; - if (*addr >= max_addr) { - *addr = max_addr; - ret = -1; + struct numa_memblk *eb = &ei->blk[ei->nr_blks]; + + if (ei->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: Too many emulated memblks, failing emulation\n"); + return -EINVAL; } - nodes[nid].end = *addr; - node_set(nid, node_possible_map); + + ei->nr_blks++; + eb->start = start; + eb->end = end; + eb->nid = nid; if (emu_nid_to_phys[nid] == NUMA_NO_NODE) emu_nid_to_phys[nid] = physnid; printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, - nodes[nid].start, nodes[nid].end, - (nodes[nid].end - nodes[nid].start) >> 20); - return ret; + eb->start, eb->end, (eb->end - eb->start) >> 20); + return 0; } /* * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr * to max_addr. The return value is the number of nodes allocated. */ -static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) +static int __init split_nodes_interleave(struct numa_meminfo *ei, + u64 addr, u64 max_addr, int nr_nodes) { nodemask_t physnode_mask = NODE_MASK_NONE; u64 size; int big; - int ret = 0; - int i; + int nid = 0; + int i, ret; if (nr_nodes <= 0) return -1; @@ -721,7 +734,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) u64 end = physnodes[i].start + size; u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); - if (ret < big) + if (nid < big) end += FAKE_NODE_MIN_SIZE; /* @@ -760,16 +773,21 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) * happen as a result of rounding down each node's size * to FAKE_NODE_MIN_SIZE. */ - if (nodes_weight(physnode_mask) + ret >= nr_nodes) + if (nodes_weight(physnode_mask) + nid >= nr_nodes) end = physnodes[i].end; - if (setup_node_range(ret++, i, &physnodes[i].start, - end - physnodes[i].start, - physnodes[i].end) < 0) + ret = emu_setup_memblk(ei, nid++, i, + physnodes[i].start, + min(end, physnodes[i].end)); + if (ret < 0) + return ret; + + physnodes[i].start = min(end, physnodes[i].end); + if (physnodes[i].start == physnodes[i].end) node_clear(i, physnode_mask); } } - return ret; + return 0; } /* @@ -794,12 +812,13 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) * Sets up fake nodes of `size' interleaved over physical nodes ranging from * `addr' to `max_addr'. The return value is the number of nodes allocated. */ -static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + u64 addr, u64 max_addr, u64 size) { nodemask_t physnode_mask = NODE_MASK_NONE; u64 min_size; - int ret = 0; - int i; + int nid = 0; + int i, ret; if (!size) return -1; @@ -854,30 +873,31 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) memblock_x86_hole_size(end, physnodes[i].end) < size) end = physnodes[i].end; - /* - * Setup the fake node that will be allocated as bootmem - * later. If setup_node_range() returns non-zero, there - * is no more memory available on this physical node. - */ - if (setup_node_range(ret++, i, &physnodes[i].start, - end - physnodes[i].start, - physnodes[i].end) < 0) + ret = emu_setup_memblk(ei, nid++, i, + physnodes[i].start, + min(end, physnodes[i].end)); + if (ret < 0) + return ret; + + physnodes[i].start = min(end, physnodes[i].end); + if (physnodes[i].start == physnodes[i].end) node_clear(i, physnode_mask); } } - return ret; + return 0; } /* * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. */ -static int __init numa_emulation(int acpi, int amd) +static bool __init numa_emulation(int acpi, int amd) { static struct numa_meminfo ei __initdata; const u64 max_addr = max_pfn << PAGE_SHIFT; - int num_nodes; - int i; + int i, ret; + + memset(&ei, 0, sizeof(ei)); for (i = 0; i < MAX_NUMNODES; i++) emu_nid_to_phys[i] = NUMA_NO_NODE; @@ -891,52 +911,33 @@ static int __init numa_emulation(int acpi, int amd) u64 size; size = memparse(emu_cmdline, &emu_cmdline); - num_nodes = split_nodes_size_interleave(0, max_addr, size); + ret = split_nodes_size_interleave(&ei, 0, max_addr, size); } else { unsigned long n; n = simple_strtoul(emu_cmdline, NULL, 0); - num_nodes = split_nodes_interleave(0, max_addr, n); + ret = split_nodes_interleave(&ei, 0, max_addr, n); + } + + if (ret < 0) + return false; + + if (numa_cleanup_meminfo(&ei) < 0) { + pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + return false; } - if (num_nodes < 0) - return num_nodes; + /* commit */ + numa_meminfo = ei; /* make sure all emulated nodes are mapped to a physical node */ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) if (emu_nid_to_phys[i] == NUMA_NO_NODE) emu_nid_to_phys[i] = 0; - ei.nr_blks = num_nodes; - for (i = 0; i < ei.nr_blks; i++) { - ei.blk[i].start = nodes[i].start; - ei.blk[i].end = nodes[i].end; - ei.blk[i].nid = i; - } - - memnode_shift = compute_hash_shift(&ei); - if (memnode_shift < 0) { - memnode_shift = 0; - printk(KERN_ERR "No NUMA hash function found. NUMA emulation " - "disabled.\n"); - return -1; - } - - /* - * We need to vacate all active ranges that may have been registered for - * the e820 memory map. - */ - remove_all_active_ranges(); - for_each_node_mask(i, node_possible_map) - memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); - init_memory_mapping_high(); - for_each_node_mask(i, node_possible_map) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - fake_physnodes(acpi, amd, num_nodes); - numa_init_array(); + fake_physnodes(acpi, amd, &ei); numa_emu_dist = true; - return 0; + return true; } #endif /* CONFIG_NUMA_EMU */ @@ -988,15 +989,13 @@ void __init initmem_init(void) continue; #ifdef CONFIG_NUMA_EMU setup_physnodes(0, max_pfn << PAGE_SHIFT); - if (emu_cmdline && !numa_emulation(i == 0, i == 1)) - return; - - /* not emulating, build identity mapping for numa_add_cpu() */ - for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) - emu_nid_to_phys[j] = j; - - nodes_clear(node_possible_map); - nodes_clear(node_online_map); + /* + * If requested, try emulation. If emulation is not used, + * build identity emu_nid_to_phys[] for numa_add_cpu() + */ + if (!emu_cmdline || !numa_emulation(i == 0, i == 1)) + for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) + emu_nid_to_phys[j] = j; #endif if (numa_register_memblks(&numa_meminfo) < 0) continue;