From 6ec6e0d9f2fd7cb6ca6bc3bfab5ae7b5cdd8c36f Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 25 Mar 2008 10:14:35 -0700 Subject: [PATCH] srat, x86: add support for nodes spanning other nodes For example, If the physical address layout on a two node system with 8 GB memory is something like: node 0: 0-2GB, 4-6GB node 1: 2-4GB, 6-8GB Current kernels fail to boot/detect this NUMA topology. ACPI SRAT tables can expose such a topology which needs to be supported. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 9 +++++++++ arch/x86/mm/k8topology_64.c | 2 +- arch/x86/mm/numa_64.c | 16 +++++++++++----- arch/x86/mm/srat_64.c | 32 +++++++++++++++++++++----------- include/asm-x86/numa_64.h | 3 ++- 5 files changed, 44 insertions(+), 18 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2a59dbb2824..07cf7711356 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -903,6 +903,15 @@ config X86_64_ACPI_NUMA help Enable ACPI SRAT based node topology detection. +# Some NUMA nodes have memory ranges that span +# other nodes. Even though a pfn is valid and +# between a node's start and end pfns, it may not +# reside on that node. See memmap_init_zone() +# for details. +config NODES_SPAN_OTHER_NODES + def_bool y + depends on X86_64_ACPI_NUMA + config NUMA_EMU bool "NUMA emulation" depends on X86_64 && NUMA diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 7a2ebce87df..86808e666f9 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -164,7 +164,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) if (!found) return -1; - memnode_shift = compute_hash_shift(nodes, 8); + memnode_shift = compute_hash_shift(nodes, 8, NULL); if (memnode_shift < 0) { printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); return -1; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 2ea56f48f29..cb317018635 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -60,7 +60,7 @@ unsigned long __initdata nodemap_size; * -1 if node overlap or lost ram (shift too big) */ static int __init populate_memnodemap(const struct bootnode *nodes, - int numnodes, int shift) + int numnodes, int shift, int *nodeids) { unsigned long addr, end; int i, res = -1; @@ -76,7 +76,12 @@ static int __init populate_memnodemap(const struct bootnode *nodes, do { if (memnodemap[addr >> shift] != NUMA_NO_NODE) return -1; - memnodemap[addr >> shift] = i; + + if (!nodeids) + memnodemap[addr >> shift] = i; + else + memnodemap[addr >> shift] = nodeids[i]; + addr += (1UL << shift); } while (addr < end); res = 1; @@ -139,7 +144,8 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes, return i; } -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +int __init compute_hash_shift(struct bootnode *nodes, int numnodes, + int *nodeids) { int shift; @@ -149,7 +155,7 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes) printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); - if (populate_memnodemap(nodes, numnodes, shift) != 1) { + if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { printk(KERN_INFO "Your memory is not aligned you need to " "rebuild your kernel with a bigger NODEMAPSIZE " "shift=%d\n", shift); @@ -462,7 +468,7 @@ done: } } out: - memnode_shift = compute_hash_shift(nodes, num_nodes); + memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); if (memnode_shift < 0) { memnode_shift = 0; printk(KERN_ERR "No NUMA hash function found. NUMA emulation " diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 1bae9c855ce..fb43d89f46f 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -32,6 +32,10 @@ static struct bootnode nodes_add[MAX_NUMNODES]; static int found_add_area __initdata; int hotadd_percent __initdata = 0; +static int num_node_memblks __initdata; +static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; +static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; + /* Too small nodes confuse the VM badly. Usually they result from BIOS bugs. */ #define NODE_MIN_SIZE (4*1024*1024) @@ -41,17 +45,17 @@ static __init int setup_node(int pxm) return acpi_map_pxm_to_node(pxm); } -static __init int conflicting_nodes(unsigned long start, unsigned long end) +static __init int conflicting_memblks(unsigned long start, unsigned long end) { int i; - for_each_node_mask(i, nodes_parsed) { - struct bootnode *nd = &nodes[i]; + for (i = 0; i < num_node_memblks; i++) { + struct bootnode *nd = &node_memblk_range[i]; if (nd->start == nd->end) continue; if (nd->end > start && nd->start < end) - return i; + return memblk_nodeid[i]; if (nd->end == end && nd->start == start) - return i; + return memblk_nodeid[i]; } return -1; } @@ -258,7 +262,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) bad_srat(); return; } - i = conflicting_nodes(start, end); + i = conflicting_memblks(start, end); if (i == node) { printk(KERN_WARNING "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", @@ -283,10 +287,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) nd->end = end; } - printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, - nd->start, nd->end); - e820_register_active_regions(node, nd->start >> PAGE_SHIFT, - nd->end >> PAGE_SHIFT); + printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, + start, end); + e820_register_active_regions(node, start >> PAGE_SHIFT, + end >> PAGE_SHIFT); push_node_boundaries(node, nd->start >> PAGE_SHIFT, nd->end >> PAGE_SHIFT); @@ -298,6 +302,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) if ((nd->start | nd->end) == 0) node_clear(node, nodes_parsed); } + + node_memblk_range[num_node_memblks].start = start; + node_memblk_range[num_node_memblks].end = end; + memblk_nodeid[num_node_memblks] = node; + num_node_memblks++; } /* Sanity check to catch more bad SRATs (they are amazingly common). @@ -368,7 +377,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } - memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES); + memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, + memblk_nodeid); if (memnode_shift < 0) { printk(KERN_ERR "SRAT: No NUMA node hash function found. Contact maintainer\n"); diff --git a/include/asm-x86/numa_64.h b/include/asm-x86/numa_64.h index 32c22ae0709..22e87c9f6a8 100644 --- a/include/asm-x86/numa_64.h +++ b/include/asm-x86/numa_64.h @@ -9,7 +9,8 @@ struct bootnode { u64 end; }; -extern int compute_hash_shift(struct bootnode *nodes, int numnodes); +extern int compute_hash_shift(struct bootnode *nodes, int numblks, + int *nodeids); #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) -- 2.46.0