[POWERPC] Optimize counting distinct entries in the relocation sections

When a module has relocation sections with tens of thousands of entries, counting the distinct/unique entries only (i.e. no duplicates) at load time can take tens of seconds and up to minutes. The sore point is the count_relocs() function which is called as part of the architecture specific module loading processing path: -> load_module() generic -> module_frob_arch_sections() arch specific -> get_plt_size() 32-bit -> get_stubs_size() 64-bit -> count_relocs() Here count_relocs is being called to find out how many distinct targets of R_PPC_REL24 relocations there are, since each distinct target needs a PLT entry or a stub created for it. The previous counting algorithm has O(n^2) complexity. Basically two solutions were proposed on the e-mail list: a hash based approach and a sort based approach. The hash based approach is the fastest (O(n)) but the has it needs additional memory and for certain corner cases it could take lots of memory due to the degeneration of the hash. One such proposal was submitted here: http://ozlabs.org/pipermail/linuxppc-dev/2007-June/037641.html The sort based approach is slower (O(n * log n + n)) but if the sorting is done "in place" it doesn't need additional memory. This has O(n + n * log n) complexity with no additional memory requirements. This commit implements the in-place sort option. Signed-off-by: Emil Medve <Emilian.Medve@Freescale.com> Signed-off-by: Paul Mackerras <paulus@samba.org>
author: Emil Medve <Emilian.Medve@Freescale.com> 2007-11-14 03:24:04 +1100
committer: Paul Mackerras <paulus@samba.org> 2007-12-21 15:05:58 +1100
commit: eda09fbdcd8c5afaa81c2f1d28e8b9725bad4d5a (patch)
tree: 0afe6e81172fc9508f8e4c13598a276fe3d043c6 /arch/powerpc/kernel/module_32.c
parent: 1fe58a875e4bb08125c657b1b91ac515d2bdbcbe (diff)
1 files changed, 63 insertions, 14 deletions
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
index 07a89a39863..eab31385831 100644
--- a/arch/powerpc/kernel/module_32.c
+++ b/arch/powerpc/kernel/module_32.c
@@ -24,6 +24,7 @@
 #include <linux/kernel.h>
 #include <linux/cache.h>
 #include <linux/bug.h>
+#include <linux/sort.h>
 
 #include "setup.h"
 
@@ -54,22 +55,60 @@ void module_free(struct module *mod, void *module_region)
    addend) */
 static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)
 {
-	unsigned int i, j, ret = 0;
-
-	/* Sure, this is order(n^2), but it's usually short, and not
-           time critical */
-	for (i = 0; i < num; i++) {
-		for (j = 0; j < i; j++) {
-			/* If this addend appeared before, it's
-                           already been counted */
-			if (ELF32_R_SYM(rela[i].r_info)
-			    == ELF32_R_SYM(rela[j].r_info)
-			    && rela[i].r_addend == rela[j].r_addend)
-				break;
+	unsigned int i, r_info, r_addend, _count_relocs;
+
+	_count_relocs = 0;
+	r_info = 0;
+	r_addend = 0;
+	for (i = 0; i < num; i++)
+		/* Only count 24-bit relocs, others don't need stubs */
+		if (ELF32_R_TYPE(rela[i].r_info) == R_PPC_REL24 &&
+		    (r_info != ELF32_R_SYM(rela[i].r_info) ||
+		     r_addend != rela[i].r_addend)) {
+			_count_relocs++;
+			r_info = ELF32_R_SYM(rela[i].r_info);
+			r_addend = rela[i].r_addend;
 		}
-		if (j == i) ret++;
+
+	return _count_relocs;
+}
+
+static int relacmp(const void *_x, const void *_y)
+{
+	const Elf32_Rela *x, *y;
+
+	y = (Elf32_Rela *)_x;
+	x = (Elf32_Rela *)_y;
+
+	/* Compare the entire r_info (as opposed to ELF32_R_SYM(r_info) only) to
+	 * make the comparison cheaper/faster. It won't affect the sorting or
+	 * the counting algorithms' performance
+	 */
+	if (x->r_info < y->r_info)
+		return -1;
+	else if (x->r_info > y->r_info)
+		return 1;
+	else if (x->r_addend < y->r_addend)
+		return -1;
+	else if (x->r_addend > y->r_addend)
+		return 1;
+	else
+		return 0;
+}
+
+static void relaswap(void *_x, void *_y, int size)
+{
+	uint32_t *x, *y, tmp;
+	int i;
+
+	y = (uint32_t *)_x;
+	x = (uint32_t *)_y;
+
+	for (i = 0; i < sizeof(Elf32_Rela) / sizeof(uint32_t); i++) {
+		tmp = x[i];
+		x[i] = y[i];
+		y[i] = tmp;
 	}
-	return ret;
 }
 
 /* Get the potential trampolines size required of the init and
@@ -100,6 +139,16 @@ static unsigned long get_plt_size(const Elf32_Ehdr *hdr,
 			DEBUGP("Ptr: %p.  Number: %u\n",
 			       (void *)hdr + sechdrs[i].sh_offset,
 			       sechdrs[i].sh_size / sizeof(Elf32_Rela));
+
+			/* Sort the relocation information based on a symbol and
+			 * addend key. This is a stable O(n*log n) complexity
+			 * alogrithm but it will reduce the complexity of
+			 * count_relocs() to linear complexity O(n)
+			 */
+			sort((void *)hdr + sechdrs[i].sh_offset,
+			     sechdrs[i].sh_size / sizeof(Elf32_Rela),
+			     sizeof(Elf32_Rela), relacmp, relaswap);
+
 			ret += count_relocs((void *)hdr
 					     + sechdrs[i].sh_offset,
 					     sechdrs[i].sh_size
author	Emil Medve <Emilian.Medve@Freescale.com>	2007-11-14 03:24:04 +1100
committer	Paul Mackerras <paulus@samba.org>	2007-12-21 15:05:58 +1100
commit	eda09fbdcd8c5afaa81c2f1d28e8b9725bad4d5a (patch)
tree	0afe6e81172fc9508f8e4c13598a276fe3d043c6 /arch/powerpc/kernel/module_32.c
parent	1fe58a875e4bb08125c657b1b91ac515d2bdbcbe (diff)