/****************************************************************************** ** ** FILE NAME : avm_membench.c ** AUTHOR (MIPS): Christoph Buettner & Heiko Blobner ** ADJUSTED for ARM: Mario Bahr *******************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../../../fs/proc/internal.h" // Eklig: Wird für PDE gebraucht. #if IS_ENABLED(CONFIG_ARCH_IPQ806X_DT) #include #endif #if defined(CONFIG_MACH_BCM963138) #include #endif /*------------------------------------------------------------------------------------------*\ \*------------------------------------------------------------------------------------------*/ #define BLOCK_ELEMENT_BYTE 2048 #define TOTAL_BLOCKS (BLOCK_ELEMENT_BYTE * 1) /*--- in KiB ---*/ #if defined(CONFIG_MACH_PUMA6) #include static unsigned long membench_cpu_clock = 450 * 1000 * 1000; static unsigned long membench_bus_clock = 250 * 1000 * 1000; static unsigned long membench_wortbreite = 32; #elif defined(CONFIG_ARCH_PUMA5) || defined(CONFIG_MACH_PUMA5) static unsigned long membench_cpu_clock = 400 * 1000 * 1000; static unsigned long membench_bus_clock = 200 * 1000 * 1000; static unsigned long membench_wortbreite = 16; #elif defined(CONFIG_ARCH_IPQ806X_DT) static unsigned long membench_cpu_clock = 1400 * 1000 * 1000; static unsigned long membench_bus_clock = 533 * 1000 * 1000 * 2; static unsigned long membench_wortbreite = 32; #elif defined(CONFIG_MACH_BCM963138) static unsigned long membench_cpu_clock = 1000 * 1000 * 1000; static unsigned long membench_bus_clock = 800 * 1000 * 1000 * 2; static unsigned long membench_wortbreite = 16; #elif defined(CONFIG_ARCH_IPQ40XX) static unsigned long membench_cpu_clock = 7100 * 100 * 1000; static unsigned long membench_bus_clock = 5376 * 100 * 1000 * 2; static unsigned long membench_wortbreite = 16; #elif 
defined(CONFIG_ARCH_IPQ5018) static unsigned long membench_cpu_clock = 1000 * 1000 * 1000; static unsigned long membench_bus_clock = 800 * 1000 * 1000 * 2; static unsigned long membench_wortbreite = 16; #else #warning "Unknown Architecture!!!" static unsigned long membench_cpu_clock = 0; static unsigned long membench_bus_clock = 0; static unsigned long membench_wortbreite = 0; #endif static uint32_t loops = 1; #if defined(CONFIG_ARCH_IPQ806X_DT) || defined(CONFIG_MACH_BCM963138) || defined(CONFIG_ARCH_IPQ40XX) || defined(CONFIG_ARCH_IPQ5018) /* extern void v7_flush_kern_dcache_area(unsigned long page_address, ssize_t size); extern void v7_dma_flush_range(unsigned long start_virt_addr, unsigned long end_virt_addr); */ // Die folgenden 2 Definitionen funktionieren für Kernel 3.14 für dakota extern void v7_flush_kern_dcache_area(void* page_address, size_t size); extern void v7_dma_flush_range(const void* start_virt_addr, const void* end_virt_addr); static void dma_cache_inv(unsigned long start_virt_addr, size_t len) { v7_dma_flush_range((const void*)start_virt_addr, (const void*)(start_virt_addr + len - 1)); } /* static void (*dma_cache_wback_inv)(unsigned long page_address, ssize_t size) = v7_flush_kern_dcache_area; */ static void dma_cache_wback_inv(unsigned long page_address, ssize_t size) { v7_flush_kern_dcache_area((void*)page_address, size); } #endif static unsigned long Cycle_Shift = 0; static unsigned long long cycle_to_sec(unsigned long a) __attribute__((pure)); static unsigned long long cycle_to_sec(unsigned long a) { return ((a) * ((Cycle_Shift) ? 
membench_cpu_clock / 2 : membench_cpu_clock)); } #ifdef CONFIG_MACH_BCM963138 static void arm_performance_counter_init(void) { } static inline unsigned long long arm_cpu_cycles(void) { unsigned int count; count = bcm63xx_read_timer_count2(); BUG_ON(membench_cpu_clock % 50000000); return (unsigned long long)count * (membench_cpu_clock / 50000000); } #elif defined(CONFIG_ARCH_IPQ806X_DT) static void arm_performance_counter_init(void) { } static inline unsigned long long arm_cpu_cycles(void) { const uint32_t timer_freq = msm_get_timer_freq(); unsigned long timer_ticks; static unsigned long last_timer_ticks; static unsigned long long timer_offset = 0ULL; timer_ticks = msm_get_timer_count(); if(timer_ticks < last_timer_ticks) { timer_offset += 1ULL << 32; } last_timer_ticks = timer_ticks; BUG_ON(membench_cpu_clock % timer_freq); return ((unsigned long long)timer_ticks + timer_offset) * (membench_cpu_clock / timer_freq); } #elif defined(CONFIG_ARCH_IPQ40XX) || defined(CONFIG_ARCH_IPQ5018) extern u32 arch_timer_get_rate(void); extern u64 (*arch_timer_read_counter)(void); static void arm_performance_counter_init(void) { } static inline unsigned long long arm_cpu_cycles(void) { unsigned int timer_freq = arch_timer_get_rate(); u64 timer_ticks = arch_timer_read_counter(); return (timer_ticks * (membench_cpu_clock / timer_freq)); } #else /*--------------------------------------------------------------------------------*\ * der performance-Counter wird auch fuer get_cycles() verwendet! 
\*--------------------------------------------------------------------------------*/ static void arm_performance_counter_init(void) { union __performance_monitor_control C; write_secure_debug_enable_register(0, 1); C.Register = read_p15_performance_monitor_control(); if (C.Bits.EnableCounters && C.Bits.CycleCounterDivider) { Cycle_Shift = 5; return; } C.Bits.CycleCounterDivider = 0; /*--- entspricht damit CPU-Takt ---*/ C.Bits.EnableCounters = 1; write_p15_performance_monitor_control(C.Register); pr_debug("%s: enable cycle_count performance-monitor-register: %x\n", __func__, read_p15_performance_monitor_control()); } /*--------------------------------------------------------------------------------*\ \*--------------------------------------------------------------------------------*/ #define arm_cpu_cycles() (read_p15_cycle_counter() << Cycle_Shift) #endif static unsigned long long mess_laenge; static void mess_laenge_set(unsigned long seconds) { mess_laenge = (membench_cpu_clock * seconds); } static unsigned long long div_mod_64_32(unsigned long long dividend, unsigned long divisor, unsigned long long *remainder) { unsigned long long result = 0; BUG_ON(!divisor); while (dividend >= divisor) { dividend -= divisor; result++; } if (remainder) *remainder = dividend; return result; } static unsigned long long div64_32(unsigned long long dividend, unsigned long long divisor) { return div_mod_64_32(dividend, divisor, NULL); } static unsigned long long mod64_32(unsigned long long dividend, unsigned long long divisor) { unsigned long long remainder = 0; div_mod_64_32(dividend, divisor, &remainder); return remainder; } static unsigned long long zeit_s(void) __attribute__((pure)); static unsigned long long zeit_s(void) { if (IS_ENABLED(CONFIG_ARM) && IS_ENABLED(CONFIG_CPU_V7)) return div64_32(mess_laenge, cycle_to_sec(1)); else return mess_laenge / cycle_to_sec(1); } static unsigned long long zeit_ms(void) __attribute__ ((pure)); static unsigned long long zeit_ms(void) { 
unsigned long long cycles_per_sec = cycle_to_sec(1); if (IS_ENABLED(CONFIG_ARM) && IS_ENABLED(CONFIG_CPU_V7)) { unsigned long long dividend = mod64_32(mess_laenge, cycles_per_sec); unsigned long long divisor = div64_32(cycles_per_sec, 1000); return div64_32(dividend, divisor); } else { return (mess_laenge % cycles_per_sec) / (cycles_per_sec / 1000); } } static unsigned long long kb_pro_sec(unsigned long kb, unsigned long loops) __attribute__((pure)); static unsigned long long kb_pro_sec(unsigned long kb, unsigned long loops) { unsigned long long dividend = (kb / loops) * 1000; if (IS_ENABLED(CONFIG_ARM) && IS_ENABLED(CONFIG_CPU_V7)) { unsigned long long divisor = zeit_s() * 1000 + zeit_ms(); return div64_32(dividend, divisor); } else { return dividend / (zeit_s() * 1000 + zeit_ms()); } } static unsigned long worte_pro_sec(unsigned long kb, unsigned long loops, unsigned wortbreite) __attribute__((pure)); static unsigned long worte_pro_sec(unsigned long kb, unsigned long loops, unsigned wortbreite) { return kb_pro_sec(kb, loops) * (1024 / (wortbreite / 8)); } static unsigned long worte_pro_clock_1(unsigned long kb, unsigned long loops, unsigned wortbreite) __attribute__((pure)); static unsigned long worte_pro_clock_1(unsigned long kb, unsigned long loops, unsigned wortbreite) { return membench_bus_clock / worte_pro_sec(kb, loops, wortbreite); } static unsigned long worte_pro_clock_10(unsigned long kb, unsigned long loops, unsigned long wortbreite) __attribute__((pure)); static unsigned long worte_pro_clock_10(unsigned long kb, unsigned long loops, unsigned long wortbreite) { return (membench_bus_clock / (worte_pro_sec(kb, loops, wortbreite) / 1000)) % 1000; } /*------------------------------------------------------------------------------------------*\ * Pipeline-friendly Read * * -16x 4-byte-Werte pro Schleifendurchlauf * -> 16 Lesezugriffe pro Schleifendurchlauf \*------------------------------------------------------------------------------------------*/ static 
unsigned long do_measure__read_pipe(char *mem, int irqsave, int loops) { int i; unsigned long flags; unsigned long kb = 0; unsigned long long measure_start = 0; unsigned long long measure_end = 0; arm_performance_counter_init(); for (i = 0; i < loops; i++) { unsigned long long time_in_double_cpu_clocks = 0; if (irqsave) { local_irq_save(flags); } dma_cache_inv((unsigned long)mem, TOTAL_BLOCKS * 1024); do { unsigned long long tick_value; unsigned long long tick_value_end; register int p = (unsigned int)mem; register int p_end = p + (TOTAL_BLOCKS * 1024); tick_value = arm_cpu_cycles(); if (!measure_start) measure_start = tick_value; /*--------------------------------------------------------------------------------*\ * So sieht Schleife aus: f48: e8bc00ff ldm ip!, {r0, r1, r2, r3, r4, r5, r6, r7} f4c: e8bc00ff ldm ip!, {r0, r1, r2, r3, r4, r5, r6, r7} f50: e24ee001 sub lr, lr, #1 f54: e6ffe07e uxth lr, lr f58: e35e0000 cmp lr, #0 f5c: 1afffff9 bne f48 \*--------------------------------------------------------------------------------*/ while (p < p_end) {/*--- 64 Byte pro Durchlauf ---*/ register int dummy0 asm("r0"); register int dummy1 asm("r1"); register int dummy2 asm("r2"); register int dummy3 asm("r3"); register int dummy4 asm("r4"); register int dummy5 asm("r5"); register int dummy6 asm("r6"); register int dummy7 asm("r7"); __asm__ __volatile__ ("ldmia %8!, { %0,%1,%2,%3,%4,%5,%6,%7 }\n" : "=r" (dummy0), "=r" (dummy1), "=r" (dummy2), "=r" (dummy3), "=r" (dummy4), "=r" (dummy5), "=r" (dummy6), "=r" (dummy7), "+r" (p) : ); __asm__ __volatile__ ("ldmia %8!, { %0,%1,%2,%3,%4,%5,%6,%7 }\n" : "=r" (dummy0), "=r" (dummy1), "=r" (dummy2), "=r" (dummy3), "=r" (dummy4), "=r" (dummy5), "=r" (dummy6), "=r" (dummy7), "+r" (p) : ); } measure_end = tick_value_end = arm_cpu_cycles(); if (tick_value_end < tick_value) pr_warn("%s: timer register overrun! 
Please " "repeat benchmark!\n", __func__); if (tick_value_end == tick_value) { pr_warn("%s: test is too fast!\n", __func__); break; } time_in_double_cpu_clocks += (tick_value_end - tick_value); kb += TOTAL_BLOCKS; } while (time_in_double_cpu_clocks < mess_laenge); if (irqsave) { local_irq_restore(flags); } printk("*"); } printk(" => 0x%llx .. 0x%llx -> 0x%llx\n", measure_start, measure_end, measure_end - measure_start); return kb; } /*------------------------------------------------------------------------------------------*\ * Extreme Read * * -16x 4-byte-Werte werden jeweils von 4 unterschiedlichen Adressen gelesen * -> 16*4 Lesezugriffe pro Schleifendurchlauf \*------------------------------------------------------------------------------------------*/ static unsigned long do_measure__read_extreme(char *mem, int irqsave, int loops) { int i; int x; unsigned int *local_mem[4]; unsigned long flags; unsigned long kb = 0; unsigned long long measure_start = 0; unsigned long long measure_end = 0; arm_performance_counter_init(); for (x = 0; x < loops; x++) { unsigned long long time_in_double_cpu_clocks = 0; unsigned long long tick_value; unsigned long long tick_value_end; if (irqsave) { local_irq_save(flags); } dma_cache_inv((unsigned long)mem, TOTAL_BLOCKS * 1024); do { unsigned int p_end; for (i = 0; i < 4; i++) { local_mem[i] = (unsigned int *) (mem + (i * (TOTAL_BLOCKS * 1024 / 4))); } p_end = (unsigned int)local_mem[1]; tick_value = arm_cpu_cycles(); if (!measure_start) measure_start = tick_value; for (; (unsigned int)local_mem[0] < p_end; local_mem[0] += BLOCK_ELEMENT_BYTE / sizeof(unsigned int), local_mem[1] += BLOCK_ELEMENT_BYTE / sizeof(unsigned int), local_mem[2] += BLOCK_ELEMENT_BYTE / sizeof(unsigned int), local_mem[3] += BLOCK_ELEMENT_BYTE / sizeof(unsigned int)) { register int dummy0 asm("r0"); register int dummy1 asm("r1"); register int dummy2 asm("r2"); register int dummy3 asm("r3"); register unsigned int p0 = (unsigned int)local_mem[0]; register 
unsigned int p1 = (unsigned int)local_mem[1]; register unsigned int p2 = (unsigned int)local_mem[2]; register unsigned int p3 = (unsigned int)local_mem[3]; __asm__ __volatile__ (" ldr %0, [%1, #0]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #0]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #0]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #0]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #4]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #4]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #4]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #4]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #8]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #8]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #8]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #8]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #12]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #12]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #12]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #12]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #16]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #16]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #16]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #16]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #20]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #20]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #20]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #20]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #24]\n" : "=r" (dummy0) : "r" (p0)); 
__asm__ __volatile__ (" ldr %0, [%1, #24]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #24]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #24]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #28]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #28]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #28]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #28]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #32]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #32]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #32]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #32]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #36]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #36]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #36]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #36]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #40]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #40]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #40]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #40]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #44]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #44]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #44]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #44]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #48]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #48]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #48]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #48]\n" : "=r" (dummy3) : "r" (p3)); 
__asm__ __volatile__ (" ldr %0, [%1, #52]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #52]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #52]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #52]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #56]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #56]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #56]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #56]\n" : "=r" (dummy3) : "r" (p3)); __asm__ __volatile__ (" ldr %0, [%1, #60]\n" : "=r" (dummy0) : "r" (p0)); __asm__ __volatile__ (" ldr %0, [%1, #60]\n" : "=r" (dummy1) : "r" (p1)); __asm__ __volatile__ (" ldr %0, [%1, #60]\n" : "=r" (dummy2) : "r" (p2)); __asm__ __volatile__ (" ldr %0, [%1, #60]\n" : "=r" (dummy3) : "r" (p3)); } measure_end = tick_value_end = arm_cpu_cycles(); if (tick_value_end < tick_value) pr_warn("%s: timer register overrun! Please " "repeat benchmark!\n", __func__); if (tick_value_end == tick_value) { pr_warn("%s: test is too fast!\n", __func__); break; } time_in_double_cpu_clocks += (tick_value_end - tick_value); kb += TOTAL_BLOCKS; } while (time_in_double_cpu_clocks < mess_laenge); if (irqsave) { local_irq_restore(flags); } printk("."); } printk(" => 0x%llx .. 
0x%llx -> 0x%llx\n", measure_start, measure_end, measure_end - measure_start); return kb; } /*------------------------------------------------------------------------------------------*\ * Mixture Read/Write * * -1x 4-Byte Lesen + 1x 4-Byte Schreiben * -> 2 Zugriffe pro Schleifendurchlauf \*------------------------------------------------------------------------------------------*/ static unsigned long do_measure__read_mixture(char *mem, int irqsave, int loops) { int i; unsigned long flags; unsigned long kb = 0; unsigned long long measure_start = 0; unsigned long long measure_end = 0; arm_performance_counter_init(); for (i = 0; i < loops; i++) { unsigned long long time_in_double_cpu_clocks = 0; if (irqsave) { local_irq_save(flags); } dma_cache_inv((unsigned long)mem, TOTAL_BLOCKS * 1024); do { unsigned long long tick_value; unsigned long long tick_value_end; register unsigned int p = (unsigned int)mem; register unsigned int p_end = (unsigned int)mem + (TOTAL_BLOCKS * 1024); tick_value = arm_cpu_cycles(); if (!measure_start) measure_start = tick_value; for (; p < p_end; p += sizeof(unsigned int) * 4) { register int dummy0 asm("r0"); __asm__ __volatile__ (" ldr %0, [%1, #0] \n" : "=r" (dummy0) : "r" (p)); __asm__ __volatile__ (" str %0, [%1, #0] \n" : : "r" (dummy0), "r" (p) ); __asm__ __volatile__ (" ldr %0, [%1, #4] \n" : "=r" (dummy0) : "r" (p)); __asm__ __volatile__ (" str %0, [%1, #4] \n" : : "r" (dummy0), "r" (p) ); __asm__ __volatile__ (" ldr %0, [%1, #8] \n" : "=r" (dummy0) : "r" (p)); __asm__ __volatile__ (" str %0, [%1, #8] \n" : : "r" (dummy0), "r" (p) ); __asm__ __volatile__ (" ldr %0, [%1, #12] \n" : "=r" (dummy0) : "r" (p)); __asm__ __volatile__ (" str %0, [%1, #12] \n" : : "r" (dummy0), "r" (p) ); } dma_cache_wback_inv((unsigned long)mem, TOTAL_BLOCKS * 1024); measure_end = tick_value_end = arm_cpu_cycles(); if (tick_value_end < tick_value) pr_warn("%s: timer register overrun! 
Please " "repeat benchmark!\n", __func__); if (tick_value_end == tick_value) { pr_warn("%s: test is too fast!\n", __func__); break; } time_in_double_cpu_clocks += (tick_value_end - tick_value); kb += TOTAL_BLOCKS; } while (time_in_double_cpu_clocks < mess_laenge); if (irqsave) { local_irq_restore(flags); } printk("*"); } printk(" => 0x%llx .. 0x%llx -> 0x%llx\n", measure_start, measure_end, measure_end - measure_start); return kb; } /*--------------------------------------------------------------------------------*\ \*--------------------------------------------------------------------------------*/ static unsigned long do_measure__write(char *mem, int irqsave, int loops) { int i; unsigned long flags; unsigned long kb = 0; unsigned long long measure_start = 0; unsigned long long measure_end = 0; arm_performance_counter_init(); for (i = 0; i < loops; i++) { unsigned long long time_in_double_cpu_clocks = 0; if (irqsave) { local_irq_save(flags); } dma_cache_inv((unsigned long)mem, TOTAL_BLOCKS * 1024); do { unsigned long long tick_value; unsigned long long tick_value_end; register unsigned int p = (unsigned int)mem; register unsigned int p_end = p + (TOTAL_BLOCKS * 1024); tick_value = arm_cpu_cycles(); if (!measure_start) measure_start = tick_value; while (p < p_end) { register int dummy0 = 23; __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : 
"r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); __asm__ __volatile__("stmia %0!, { %1 }\n" : "+r" (p) : "r"(dummy0)); } dma_cache_wback_inv((unsigned long)mem, TOTAL_BLOCKS * 1024); measure_end = tick_value_end = arm_cpu_cycles(); if (tick_value_end < tick_value) pr_warn("%s: timer register overrun! Please " "repeat benchmark!\n", __func__); if (tick_value_end == tick_value) { pr_warn("%s: test is too fast!\n", __func__); break; } time_in_double_cpu_clocks += (tick_value_end - tick_value); kb += TOTAL_BLOCKS; } while (time_in_double_cpu_clocks < mess_laenge); if (irqsave) { local_irq_restore(flags); } printk("*"); } printk(" => 0x%llx .. 0x%llx -> 0x%llx\n", measure_start, measure_end, measure_end - measure_start); return kb; } /*------------------------------------------------------------------------------------------*\ * Simple Write * * -1x 4-Byte Schreiben * -> 1 Zugriff pro Schleifendurchlauf \*------------------------------------------------------------------------------------------*/ static unsigned long do_measure__writeburst(char *mem, int irqsave, int loops) { int i; unsigned long flags; unsigned long long kb = 0; unsigned long long measure_start = 0; unsigned long long measure_end = 0; unsigned long long count = 0; arm_performance_counter_init(); for (i = 0; i < loops; i++) { unsigned long long time_in_double_cpu_clocks = 0; if (irqsave) { local_irq_save(flags); } dma_cache_inv((unsigned long)mem, TOTAL_BLOCKS * 1024); do { unsigned long long tick_value; unsigned long long tick_value_end; register unsigned int p = (unsigned int)mem; tick_value = arm_cpu_cycles(); if (!measure_start) measure_start = tick_value; 
/*--------------------------------------------------------------------------------*\ * so wieht Schleife in Assembler aus (top!): 540: e8a35273 stmia r3!, {r0, r1, r4, r5, r6, r9, ip, lr} 544: e8a35273 stmia r3!, {r0, r1, r4, r5, r6, r9, ip, lr} 548: e2422001 sub r2, r2, #1 54c: e6ff2072 uxth r2, r2 550: e3520000 cmp r2, #0 554: 1afffff9 bne 540 \*--------------------------------------------------------------------------------*/ count = 0; while (count < ((TOTAL_BLOCKS * 1024) / 64)) { register int dummy0 asm("r0") = 23; register int dummy1 asm("r1") = 24; register int dummy2 asm("r2") = 25; register int dummy3 asm("r3") = 26; register int dummy4 asm("r4") = 27; register int dummy5 asm("r5") = 28; register int dummy6 asm("r6") = 29; register int dummy7 asm("r7") = 30; __asm__ __volatile__ ("stmia %0!, { %1,%2,%3,%4,%5,%6,%7,%8 }\n": "+r" (p) : "r"(dummy0), "r"(dummy1), "r"(dummy2), "r"(dummy3), "r"(dummy4), "r"(dummy5), "r"(dummy6), "r"(dummy7)); __asm__ __volatile__ ("stmia %0!, { %1,%2,%3,%4,%5,%6,%7,%8 }\n": "+r" (p) : "r"(dummy0), "r"(dummy1), "r"(dummy2), "r"(dummy3), "r"(dummy4), "r"(dummy5), "r"(dummy6), "r"(dummy7)); count++; } dma_cache_wback_inv((unsigned long)mem, TOTAL_BLOCKS * 1024); measure_end = tick_value_end = arm_cpu_cycles(); if (tick_value_end < tick_value) pr_warn("%s: timer register overrun! Please " "repeat benchmark!\n", __func__); if (tick_value_end == tick_value) { pr_warn("%s: test is too fast!\n", __func__); break; } time_in_double_cpu_clocks += (tick_value_end - tick_value); kb += TOTAL_BLOCKS; } while (time_in_double_cpu_clocks < mess_laenge); if (irqsave) { local_irq_restore(flags); } printk("* kb=%llu, count=%llu\n", kb, count); } printk(" => 0x%llx .. 
0x%llx -> 0x%llx\n", measure_start, measure_end, measure_end - measure_start); return kb; } /*------------------------------------------------------------------------------------------*\ \*------------------------------------------------------------------------------------------*/ static void print_head(struct seq_file *m, int loops, int wortbreite) { seq_puts(m, "\n\n"); seq_puts(m, "AVM-RAM-Benchmark\n"); seq_puts(m, "=============================================\n"); seq_puts(m, "IRQs: off (alle Tests mit deaktivierten IRQs)\n"); seq_printf(m, "CPU-Clock: %lu\n", membench_cpu_clock); seq_printf(m, "RAM-Clock: %lu (eff. Datentaktrate)\n", membench_bus_clock); seq_printf(m, "BUS-Breite (Word=): %d Bit\n", wortbreite); seq_printf(m, "Measure-Time: %d * %llu.%llus\n\n", loops, zeit_s(), zeit_ms()); seq_printf(m, " -- Results --\n"); seq_puts(m, "=============================================================================\n"); seq_printf(m, " type | total read | loops | DDR-Ticks | %2dBit |\n", wortbreite); seq_printf(m, " | in kByte | | /%2dBit | Worte/s | kByte/s\n", wortbreite); seq_puts(m, "=============================================================================\n"); udelay(100); } /*------------------------------------------------------------------------------------------*\ \*------------------------------------------------------------------------------------------*/ static void print_read_pipe(struct seq_file *m, int loops, int wortbreite, char *kmem) { unsigned long kb; kb = do_measure__read_pipe(kmem, 1, loops); seq_printf(m, "read | %7lu | %1d | %5lu.%03lu | %9lu | %6llu\n", kb, loops, worte_pro_clock_1(kb, loops, wortbreite), worte_pro_clock_10(kb, loops, wortbreite), worte_pro_sec(kb, loops, wortbreite), kb_pro_sec(kb, loops)); seq_puts(m, " | | | | |\n"); seq_puts(m, "Burstartiges Lesen aus dem RAM unter Nutzung von load multiple.\n"); seq_puts(m, "-----------------------------------------------------------------------------\n"); } 
/*------------------------------------------------------------------------------------------*\ \*------------------------------------------------------------------------------------------*/ static void print_read_extreme(struct seq_file *m, int loops, int wortbreite, char *kmem) { unsigned long kb; kb = do_measure__read_extreme(kmem, 1, loops); seq_printf(m, "read | %7lu | %1d | %5lu.%03lu | %9lu | %6llu\n", kb, loops, worte_pro_clock_1(kb, loops, wortbreite), worte_pro_clock_10(kb, loops, wortbreite), worte_pro_sec(kb, loops, wortbreite), kb_pro_sec(kb, loops)); seq_puts(m, " | | | | |\n"); seq_puts(m, "Die gelesenen Werte stehen im Speicher nicht hintereinander.\n"); seq_puts(m, "D.h. die CPU kann den Cache nicht nutzen.\n"); seq_puts(m, "-----------------------------------------------------------------------------\n"); } /*------------------------------------------------------------------------------------------*\ \*------------------------------------------------------------------------------------------*/ static void print_readwrite(struct seq_file *m, int loops, int wortbreite, char *kmem) { unsigned long kb; kb = do_measure__read_mixture(kmem, 1, loops); seq_printf(m, "read/write | %7lu | %1d | %5lu.%03lu | %9lu | %6llu\n", kb, loops, worte_pro_clock_1(kb, loops, wortbreite), worte_pro_clock_10(kb, loops, wortbreite), worte_pro_sec(kb, loops, wortbreite), kb_pro_sec(kb, loops)); seq_puts(m, " | | | | |\n"); seq_puts(m, "Immer schoen im Wechsel 1x Lesen und 1x Schreiben.\n"); seq_puts(m, "-----------------------------------------------------------------------------\n"); } /*------------------------------------------------------------------------------------------*\ \*------------------------------------------------------------------------------------------*/ static void print_write(struct seq_file *m, int loops, int wortbreite, char *kmem) { unsigned long kb; kb = do_measure__write(kmem, 1, loops); seq_printf(m, "write | %7lu | %1d | %5lu.%03lu | %9lu | 
%6llu\n", kb, loops, worte_pro_clock_1(kb, loops, wortbreite), worte_pro_clock_10(kb, loops, wortbreite), worte_pro_sec(kb, loops, wortbreite), kb_pro_sec(kb, loops)); seq_puts(m, " | | | | |\n"); seq_puts(m, "Einfaches Schreiben (Cache-Nutzung).\n"); seq_puts(m, "-----------------------------------------------------------------------------\n"); } /*------------------------------------------------------------------------------------------*\ \*------------------------------------------------------------------------------------------*/ static void print_writeburst(struct seq_file *m, int loops, int wortbreite, char *kmem) { unsigned long kb; kb = do_measure__writeburst(kmem, 1, loops); seq_printf(m, "write | %7lu | %1d | %5lu.%03lu | %9lu | %6llu\n", kb, loops, worte_pro_clock_1(kb, loops, wortbreite), worte_pro_clock_10(kb, loops, wortbreite), worte_pro_sec(kb, loops, wortbreite), kb_pro_sec(kb, loops)); seq_puts(m, " | | | | |\n"); seq_puts(m, "Burst-Schreiben unter Nutzung von store multiple.\n"); seq_puts(m, "-----------------------------------------------------------------------------\n"); } /*--------------------------------------------------------------------------------*\ \*--------------------------------------------------------------------------------*/ static int do_help(struct seq_file *m, void *data __maybe_unused) { seq_puts(m, "\n\n"); seq_puts(m, "AVM-RAM-Benchmark (HELP)\n"); seq_puts(m, "=============================================\n"); seq_puts(m, "cat /proc/avm/benchmark/complete -> Durchfuehrung aller Benchmarks\n"); seq_puts(m, "cat /proc/avm/benchmark/help -> Anzeige dieser Hilfe\n"); seq_puts(m, "\n"); seq_puts(m, "cat /proc/avm/benchmark/do_read_extreme -> Read Bench\n"); seq_puts(m, " Lese Bench fuer nicht-lineares Lesen.\n"); seq_puts(m, "cat /proc/avm/benchmark/do_read_pipe -> Read Bench (unter Nutzung von Load-Multiple)\n"); seq_puts(m, " Pipeline orientierter Lese benchmark\n"); seq_puts(m, "cat /proc/avm/benchmark/do_read_write -> 
Read/Schreib Bench\n"); seq_puts(m, "cat /proc/avm/benchmark/do_write -> Schreib Bench\n"); seq_puts(m, "cat /proc/avm/benchmark/do_burstwrite -> Schreib Bench (unter Nutzung von Store-Multiple)\n"); seq_puts(m, "\n\n"); return 0; } /*--------------------------------------------------------------------------------*\ \*--------------------------------------------------------------------------------*/ static int do_whatever(struct seq_file *m, void (*func)(struct seq_file *, int, int, char *)) { char *kmem = kmalloc(TOTAL_BLOCKS * 1024, GFP_ATOMIC); if (unlikely(!kmem)) { seq_puts(m, "No memory for test\n"); return -ENOMEM; } print_head(m, loops, membench_wortbreite); func(m, loops, membench_wortbreite, kmem); seq_puts(m, "\n\n"); kfree(kmem); return 0; } /*------------------------------------------------------------------------------------------*\ \*------------------------------------------------------------------------------------------*/ static void _complete_membench(struct seq_file *m, int loops, int wortbreite, char *kmem) { print_read_pipe(m, loops, wortbreite, kmem); print_read_extreme(m, loops, wortbreite, kmem); print_readwrite(m, loops, wortbreite, kmem); print_write(m, loops, wortbreite, kmem); print_writeburst(m, loops, wortbreite, kmem); } static int do_complete_membench(struct seq_file *m, void *data __maybe_unused) { return do_whatever(m, _complete_membench); } /*--------------------------------------------------------------------------------*\ \*--------------------------------------------------------------------------------*/ static int do_read_extreme(struct seq_file *m, void *data __maybe_unused) { return do_whatever(m, print_read_extreme); } /*--------------------------------------------------------------------------------*\ \*--------------------------------------------------------------------------------*/ static int do_read_pipe(struct seq_file *m, void *data __maybe_unused) { return do_whatever(m, print_read_pipe); } 
/*--------------------------------------------------------------------------------*\
\*--------------------------------------------------------------------------------*/
/* seq_file show callback for /proc/avm/benchmark/do_read_write */
static int do_read_write(struct seq_file *m, void *data __maybe_unused)
{
	return do_whatever(m, print_readwrite);
}

/*--------------------------------------------------------------------------------*\
\*--------------------------------------------------------------------------------*/
/* seq_file show callback for /proc/avm/benchmark/do_write */
static int do_write(struct seq_file *m, void *data __maybe_unused)
{
	return do_whatever(m, print_write);
}

/*--------------------------------------------------------------------------------*\
\*--------------------------------------------------------------------------------*/
/* seq_file show callback for /proc/avm/benchmark/do_writeburst */
static int do_writeburst(struct seq_file *m, void *data __maybe_unused)
{
	return do_whatever(m, print_writeburst);
}

/*--------------------------------------------------------------------------------*\
\*--------------------------------------------------------------------------------*/
/*
 * Convert a per-run byte/KiB count x into a "per second" rate:
 * (x / loops) * 1000 / elapsed-milliseconds, where elapsed time comes
 * from the zeit_s()/zeit_ms() helpers. On ARMv7 a 64/32 division helper
 * is used instead of plain C division.
 */
static unsigned long kb_value_pro_sec(int loops, unsigned long x) __attribute__((pure));
static unsigned long kb_value_pro_sec(int loops, unsigned long x)
{
	if (IS_ENABLED(CONFIG_ARM) && IS_ENABLED(CONFIG_CPU_V7)) {
		return div64_32((x / loops) * 1000, zeit_s() * 1000 + zeit_ms());
	} else {
		return ((x / loops) * 1000 ) / (zeit_s() * 1000 + zeit_ms());
	}
}

/*
 * /proc/avm/benchmark/performance_index: run all five measurements and
 * combine them into one weighted score (weights 10/8/2/1/1 on the
 * per-second rates, each scaled down by 1000), then report CPU and RAM
 * clock in MHz from the per-SoC constants at the top of the file.
 */
static int performance_index(struct seq_file *m, void *data __maybe_unused)
{
	unsigned long kb_r_burst;
	unsigned long kb_w_burst;
	unsigned long kb_w_burst_enh;
	unsigned long kb_rw;
	unsigned long kb_r;
	unsigned int irqsave = 1;	/* measure with interrupts saved/disabled -- TODO confirm semantics */
	char *kmem = kmalloc(TOTAL_BLOCKS * 1024, GFP_ATOMIC);

	if (unlikely(!kmem)) {
		seq_puts(m, "No memory for test\n");
		return -ENOMEM;
	}
	kb_r_burst = do_measure__read_pipe(kmem, irqsave, loops);
	kb_w_burst = do_measure__write(kmem, irqsave, loops);
	kb_w_burst_enh = do_measure__writeburst(kmem, irqsave, loops);
	kb_rw = do_measure__read_mixture(kmem, irqsave, loops);
	kb_r = do_measure__read_extreme(kmem, irqsave, loops);
	seq_printf(m, "Performance-Index: %lu\n",
		   kb_value_pro_sec(loops, kb_r_burst)/1000*10 +
		   kb_value_pro_sec(loops, kb_w_burst)/1000*8 +
		   kb_value_pro_sec(loops, kb_w_burst_enh)/1000*2 +
		   kb_value_pro_sec(loops, kb_rw)/1000*1 +
		   kb_value_pro_sec(loops, kb_r)/1000*1);
	seq_printf(m, "CPU-Clock: %lu MHz\n", membench_cpu_clock/(1000*1000));
	seq_printf(m, "RAM-Clock: %lu MHz\n", membench_bus_clock/(1000*1000));
	kfree(kmem);
	return 0;
}

/*------------------------------------------------------------------------------------------*\
\*------------------------------------------------------------------------------------------*/
/*
 * Run the complete benchmark suite very early, before procfs is usable,
 * by faking a seq_file over a static 1 KiB buffer and dumping it via
 * pr_err. NOTE(review): the complete table is much larger than 1 KiB,
 * so this output is presumably truncated -- verify whether that is
 * acceptable for the early-boot use case.
 */
void early_membench(void)
{
	static char buffer[1024];
	struct seq_file fake_seq_file = {
		.buf = buffer,
		.size = sizeof(buffer),
	};

	pr_err("running membench\n");
	do_complete_membench(&fake_seq_file, NULL);
	pr_err("%s", buffer);
}

#define PROC_BENCHDIR "avm/benchmark"
/* /proc/avm/benchmark directory handle; NULL until avm_membench_init ran. */
static struct proc_dir_entry *benchprocdir;

/*------------------------------------------------------------------------------------------*\
\*------------------------------------------------------------------------------------------*/
/*
 * Generic open for every benchmark proc file: the seq_file show callback
 * was stored as PDE data by proc_create_data() in avm_membench_init.
 */
static int proc_avm_membench_open(struct inode *inode, struct file *file)
{
	int (*show)(struct seq_file *, void *) = PDE(inode)->data;

	return single_open(file, show, NULL);
}

/*
 * Table describing every proc file under /proc/avm/benchmark.
 * mode/parent/pde/fops are filled in at init time; the terminating
 * empty entry ({ }) ends the registration loop.
 */
struct _proc_info {
	const char *name;
	int (*show)(struct seq_file *, void *);
	umode_t mode;
	struct proc_dir_entry *parent;
	struct proc_dir_entry *pde;
	struct file_operations fops;
} proc_info[] = {
	{ .name = "complete", .show = do_complete_membench, },
	{ .name = "help", .show = do_help, },
	{ .name = "do_read_extreme", .show = do_read_extreme, },
	{ .name = "do_read_pipe", .show = do_read_pipe, },
	{ .name = "do_read_write", .show = do_read_write, },
	{ .name = "do_write", .show = do_write, },
	{ .name = "do_writeburst", .show = do_writeburst, },
	{ .name = "performance_index", .show = performance_index, },
	{ },
};

/* debugfs "counter" file: dump the raw CPU cycle counter in hex. */
static void show_counter(struct seq_file *m, void *data __maybe_unused)
{
	unsigned long long counter;

	counter = arm_cpu_cycles();
	seq_printf(m, "0x%llx\n", counter);
}

/*
 * Module init: create /proc/avm/benchmark with one read-only seq_file
 * entry per proc_info[] row (the show callback rides along as PDE data),
 * plus a debugfs directory exposing the tunables and the cycle counter.
 *
 * NOTE(review): debugfs_create_x32() is handed (u32 *)&membench_*_clock,
 * but those variables are unsigned long -- on a 64-bit kernel this
 * aliases only half of the object (and the wrong half on big-endian).
 * Presumably this module only runs on 32-bit ARM where unsigned long is
 * 32 bits; confirm before reusing on a 64-bit target.
 */
int __init avm_membench_init(void)
{
	struct _proc_info *pi = proc_info;
	struct dentry *debugfs_dir;

	mess_laenge_set(1);
	benchprocdir = proc_mkdir(PROC_BENCHDIR, NULL);
	if (benchprocdir == NULL) {
		pr_err("%s: Unable to create /proc/%s\n", __func__, PROC_BENCHDIR);
		return -ENOMEM;
	}
	while (pi->name) {
		pi->parent = benchprocdir;
		if (pi->show) {
			pi->mode |= 0444;
			pi->fops = (struct file_operations) {
				.open = proc_avm_membench_open,
				.read = seq_read,
				.llseek = seq_lseek,
				.release = seq_release,
			};
		}
		pi->pde = proc_create_data(pi->name, pi->mode, pi->parent, &pi->fops, pi->show);
		pi++;
	}
	proc_symlink("config", benchprocdir, "/sys/kernel/debug/avm-membench/");
	debugfs_dir = debugfs_create_dir("avm-membench", NULL);
	add_simple_debugfs_file("counter", debugfs_dir, NULL, show_counter, NULL);
	debugfs_create_x32("membench_cpu_clock", 0600, debugfs_dir, (u32 *)&membench_cpu_clock);
	debugfs_create_x32("membench_bus_clock", 0600, debugfs_dir, (u32 *)&membench_bus_clock);
	debugfs_create_x64("mess_laenge", 0600, debugfs_dir, &mess_laenge);
	debugfs_create_u32("loops", 0600, debugfs_dir, &loops);
	return 0;
}

/*--------------------------------------------------------------------------------*\
\*--------------------------------------------------------------------------------*/
/*
 * Module exit: tear down the proc entries registered in init, then the
 * directory itself.
 *
 * NOTE(review): the removal loop stops at the first entry whose pde is
 * NULL -- if proc_create_data() failed for one entry at init time, any
 * later entries would be leaked here. Verify whether that can happen in
 * practice.
 */
void __exit avm_membench_exit(void)
{
	struct _proc_info *pi = proc_info;

	if (unlikely(!benchprocdir))
		return;
	while (pi->pde)
		remove_proc_entry((pi++)->name, benchprocdir);
	remove_proc_entry(PROC_BENCHDIR, NULL);
	benchprocdir = NULL;
}

module_init(avm_membench_init);
module_exit(avm_membench_exit)