MIPS memcpy: byte-wise unaligned paths for CONFIG_MACH_FUSIV_MIPS1

When CONFIG_MACH_FUSIV_MIPS1 is defined, every place in memcpy.S that
this patch touches stops using the partial-word accessors (LDFIRST,
LDREST, STFIRST, STREST) and uses lbu/sb instead:

  * the "fewer than NBYTES left" tail jumps to .Lcopy_bytes;
  * .Ldst_unaligned copies NBYTES - t1 single bytes, then takes the
    same .Lboth_aligned / .Lsrc_unaligned_dst_aligned decision as before;
  * both .Lsrc_unaligned_dst_aligned loops assemble each word from four
    lbu loads, first byte in the most significant position.

Changes made during review:

  * 4-units loop: src is no longer bumped after every assembled word.
    All sixteen loads use fixed offsets from an unchanged src, and src
    is advanced once by 4*NBYTES afterwards, as the non-Fusiv branch
    does.  The loads register .Ll_exc_copy as their fault handler while
    dst has not moved yet, so src and dst have to stay in step until
    the stores.
  * COPY_BYTE1: added the missing ';' after the EXC(sb ...) statement
    and dropped the line continuation after the macro's last line.
  * sll/or are written with three operands (same instructions).

NOTE(review): the byte order used to assemble words matches a
big-endian STORE, and the offsets assume NBYTES == 4 -- confirm both
for this platform.
NOTE(review): the header names on the four #include lines of the first
hunk were already missing from the copy this was reviewed from; restore
them from the pristine patch before applying.
NOTE(review): whitespace on context lines was re-created by hand; if a
hunk is rejected, retry with "patch -l".

--- zzzz-none-000/linux-2.6.28.10/arch/mips/lib/memcpy.S	2009-05-02 18:54:43.000000000 +0000
+++ puma5-6360-529/linux-2.6.28.10/arch/mips/lib/memcpy.S	2009-10-13 15:55:55.000000000 +0000
@@ -28,6 +28,7 @@
 #undef CONFIG_CPU_HAS_PREFETCH
 #endif
 
+#include
 #include
 #include
 #include
@@ -312,6 +313,12 @@
 	 * more instruction-level parallelism.
 	 */
 #define bits t2
+#ifdef CONFIG_MACH_FUSIV_MIPS1
+	beqz	len, .Ldone
+	 nop
+	j	.Lcopy_bytes
+	 nop
+#else
 	beqz	len, .Ldone
 	 ADD	t1, dst, len	# t1 is just past last byte of dst
 	li	bits, 8*NBYTES
@@ -322,6 +329,7 @@
 EXC(	STREST	t0, -1(t1),		.Ls_exc)
 	jr	ra
 	 move	len, zero
+#endif
 .Ldst_unaligned:
 	/*
 	 * dst is unaligned
@@ -333,6 +341,27 @@
 	 * Set match = (src and dst have same alignment)
 	 */
 #define match rem
+#ifdef CONFIG_MACH_FUSIV_MIPS1
+#define COPY_BYTE1(src,dst)			\
+EXC(	lbu	t4, 0(src),	.Ll_exc);	\
+	SUB	len, len, 1;			\
+EXC(	sb	t4, 0(dst),	.Ls_exc_p1);	\
+	ADD	src, src, 1;			\
+	beqz	len, .Ldone;			\
+	 ADD	dst, dst, 1
+	/* Copy NBYTES - t1 bytes singly so that dst becomes aligned. */
+	ADD	t2, zero, NBYTES
+	SUB	t3, t2, t1
+.Lcopy1:
+	COPY_BYTE1(src,dst)
+	SUB	t3, t3, 1
+	bne	t3, zero, .Lcopy1
+	 nop
+
+	xor	match, t0, t1
+	beqz	match, .Lboth_aligned
+	 nop
+#else
 EXC(	LDFIRST	t3, FIRST(0)(src),	.Ll_exc)
 	ADD	t2, zero, NBYTES
 EXC(	LDREST	t3, REST(0)(src),	.Ll_exc_copy)
@@ -345,7 +374,7 @@
 	ADD	dst, dst, t2
 	beqz	match, .Lboth_aligned
 	 ADD	src, src, t2
-
+#endif
 .Lsrc_unaligned_dst_aligned:
 	SRL	t0, len, LOG_NBYTES+2	 # +2 for 4 units/iter
 	PREF(	0, 3*32(src) )
@@ -360,6 +389,54 @@
  * are to the same unit (unless src is aligned, but it's not).
  */
 	R10KCBARRIER(0(ra))
+#ifdef CONFIG_MACH_FUSIV_MIPS1
+	/*
+	 * Assemble each word from byte loads, most significant byte first.
+	 * src moves only after all loads so the fault fixup stays in step.
+	 */
+EXC(	lbu	t0, 0(src),	.Ll_exc)
+EXC(	lbu	t4, 1(src),	.Ll_exc_copy)
+EXC(	lbu	t5, 2(src),	.Ll_exc_copy)
+EXC(	lbu	t6, 3(src),	.Ll_exc_copy)
+	sll	t0, t0, 8
+	or	t0, t0, t4
+	sll	t0, t0, 8
+	or	t0, t0, t5
+	sll	t0, t0, 8
+	or	t0, t0, t6
+EXC(	lbu	t1, 4(src),	.Ll_exc_copy)
+EXC(	lbu	t4, 5(src),	.Ll_exc_copy)
+EXC(	lbu	t5, 6(src),	.Ll_exc_copy)
+EXC(	lbu	t6, 7(src),	.Ll_exc_copy)
+	sll	t1, t1, 8
+	or	t1, t1, t4
+	sll	t1, t1, 8
+	or	t1, t1, t5
+	sll	t1, t1, 8
+	or	t1, t1, t6
+EXC(	lbu	t2, 8(src),	.Ll_exc_copy)
+EXC(	lbu	t4, 9(src),	.Ll_exc_copy)
+EXC(	lbu	t5, 10(src),	.Ll_exc_copy)
+EXC(	lbu	t6, 11(src),	.Ll_exc_copy)
+	sll	t2, t2, 8
+	or	t2, t2, t4
+	sll	t2, t2, 8
+	or	t2, t2, t5
+	sll	t2, t2, 8
+	or	t2, t2, t6
+EXC(	lbu	t3, 12(src),	.Ll_exc_copy)
+EXC(	lbu	t4, 13(src),	.Ll_exc_copy)
+EXC(	lbu	t5, 14(src),	.Ll_exc_copy)
+EXC(	lbu	t6, 15(src),	.Ll_exc_copy)
+	sll	t3, t3, 8
+	or	t3, t3, t4
+	sll	t3, t3, 8
+	or	t3, t3, t5
+	sll	t3, t3, 8
+	or	t3, t3, t6
+	SUB	len, len, 4*NBYTES
+	ADD	src, src, 4*NBYTES
+#else /* FUSIV MIPS1 */
 EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
 EXC(	LDFIRST	t1, FIRST(1)(src),	.Ll_exc_copy)
 	SUB     len, len, 4*NBYTES
@@ -374,6 +451,7 @@
 #ifdef CONFIG_CPU_SB1
 	nop				# improves slotting
 #endif
+#endif /* FUSIV MIPS1 */
 EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc_p4u)
 EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc_p3u)
 EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc_p2u)
@@ -391,8 +469,21 @@
 	 nop
 1:
 	R10KCBARRIER(0(ra))
+#ifdef CONFIG_MACH_FUSIV_MIPS1
+EXC(	lbu	t0, 0(src),	.Ll_exc)
+EXC(	lbu	t4, 1(src),	.Ll_exc_copy)
+EXC(	lbu	t5, 2(src),	.Ll_exc_copy)
+EXC(	lbu	t6, 3(src),	.Ll_exc_copy)
+	sll	t0, t0, 8
+	or	t0, t0, t4
+	sll	t0, t0, 8
+	or	t0, t0, t5
+	sll	t0, t0, 8
+	or	t0, t0, t6
+#else /* FUSIV MIPS1 */
 EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
 EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
+#endif /* FUSIV MIPS1 */
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
 EXC(	STORE	t0, 0(dst),	.Ls_exc_p1u)