[MIPS] R4000/R4400 daddiu erratum workaround

 This complements the generic R4000/R4400 errata workaround code and adds 
bits for the daddiu problem.  In most places it just modifies handwritten 
assembly code so that the assembler is allowed to use a temporary register 
as daddiu may now be treated as a macro that expands to a sequence of li 
and daddu.  The temporary is the AT register or, where AT is unavailable
or used explicitly for another purpose, an explicitly-named register
selected with the .set at=<reg> feature recently added to gas.  This
feature is
only used if CONFIG_CPU_DADDI_WORKAROUNDS has been set, so if the 
workaround remains disabled, the required version of binutils stays 
unchanged.
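
 For illustration, the pattern now used in places like the nmi_handler
path of arch/mips/kernel/genex.S (a sketch, with $v1 as the explicitly
named temporary) looks like this:

	.set	at=v1			# let macro expansion use $v1 as the temporary
	LONG_ADDIU	k0, 4		# may expand to: li $v1, 4; daddu $k0, $k0, $v1
	.set	noat			# forbid the assembler temporary again

 With the workaround disabled, the single daddiu instruction is emitted as
before.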

 Similarly, daddiu instructions placed in branch delay slots within
noreorder fragments are now moved out of those slots, and the assembler is
allowed to reorder them itself where possible (which it does, making the
whole idea of scheduling them into delay slots by hand questionable).
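
 Schematically (a sketch modeled on the csum_partial.S loops, with the
label simplified):

	# before: the macro sat in the delay slot of a noreorder fragment
	bnez	t8, 1b
	 PTR_ADDU	src, src, 0x80

	# after: let the assembler expand and schedule it itself
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x80
	bnez	t8, 1b
	.set	noreorder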

 In the very few places where such a simple conversion was not possible,
a longer hand-coded sequence is used instead.
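
 One such case is the exception fixup loop in csum_partial.S and memcpy.S,
where the decrement by a constant is done through a register so that no
daddiu is needed at all:

#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	bnez	src, 1b
	 SUB	src, src, 1
#else
	li	v1, 1			# constant held in a register...
	bnez	src, 1b
	 SUB	src, src, v1		# ...so SUB needs no immediate addition
#endif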

 Other than that, the code responsible for building the TLB fault and page
clear/copy handlers is changed to avoid daddiu as appropriate.  These
changes only take effect if the erratum is verified to be present at run
time.
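
 For example, when the bug is detected, the page clear/copy builder in
pg-r4k.c emits a two-instruction sequence in place of each daddiu
(schematically, for "daddiu rt, rs, offset"):

	addiu	$t9, $zero, offset	# load the immediate with a 32-bit addiu
	daddu	rt, rs, $t9		# plain daddu, which is not affected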

 Finally, there is a trivial update to __delay(), because it uses daddiu
in a branch delay slot.
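
 The loop decrement there now keeps its constant in a register when
DADDI_WAR is in effect, roughly:

	1:	bnez	%0, 1b
		 dsubu	%0, %2		# %2 is an input operand preloaded with 1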

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
commit 619b6e18fc

 arch/mips/kernel/genex.S | +7 -1

@@ -6,7 +6,7 @@
  * Copyright (C) 1994 - 2000, 2001, 2003 Ralf Baechle
  * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
  * Copyright (C) 2001 MIPS Technologies, Inc.
- * Copyright (C) 2002 Maciej W. Rozycki
+ * Copyright (C) 2002, 2007  Maciej W. Rozycki
  */
 #include <linux/init.h>
 
@@ -471,7 +471,13 @@ NESTED(nmi_handler, PT_SIZE, sp)
 	jr	k0
 	 rfe
 #else
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	LONG_ADDIU	k0, 4		/* stall on $k0 */
+#else
+	.set	at=v1
+	LONG_ADDIU	k0, 4
+	.set	noat
+#endif
 	MTC0	k0, CP0_EPC
 	/* I hope three instructions between MTC0 and ERET are enough... */
 	ori	k1, _THREAD_MASK

 arch/mips/lib/csum_partial.S | +51 -10

@@ -7,6 +7,7 @@
  *
  * Copyright (C) 1998, 1999 Ralf Baechle
  * Copyright (C) 1999 Silicon Graphics, Inc.
+ * Copyright (C) 2007  Maciej W. Rozycki
  */
 #include <linux/errno.h>
 #include <asm/asm.h>
@@ -52,9 +53,12 @@
 #define UNIT(unit)  ((unit)*NBYTES)
 
 #define ADDC(sum,reg)						\
+	.set	push;						\
+	.set	noat;						\
 	ADD	sum, reg;					\
 	sltu	v1, sum, reg;					\
-	ADD	sum, v1
+	ADD	sum, v1;					\
+	.set	pop
 
 #define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
 	LOAD	_t0, (offset + UNIT(0))(src);			\
@@ -178,8 +182,10 @@ move_128bytes:
 	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
 	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
 	LONG_SUBU	t8, t8, 0x01
+	.set	reorder				/* DADDI_WAR */
+	PTR_ADDU	src, src, 0x80
 	bnez	t8, move_128bytes
-	 PTR_ADDU	src, src, 0x80
+	.set	noreorder
 
 1:
 	beqz	t2, 1f
@@ -208,8 +214,10 @@ end_words:
 	lw	t0, (src)
 	LONG_SUBU	t8, t8, 0x1
 	ADDC(sum, t0)
+	.set	reorder				/* DADDI_WAR */
+	PTR_ADDU	src, src, 0x4
 	bnez	t8, end_words
-	 PTR_ADDU	src, src, 0x4
+	.set	noreorder
 
 /* unknown src alignment and < 8 bytes to go  */
 small_csumcpy:
@@ -246,6 +254,8 @@ small_csumcpy:
 1:	ADDC(sum, t1)
 
 	/* fold checksum */
+	.set	push
+	.set	noat
 #ifdef USE_DOUBLE
 	dsll32	v1, sum, 0
 	daddu	sum, v1
@@ -266,6 +276,7 @@ small_csumcpy:
 	srl	sum, sum, 8
 	or	sum, v1
 	andi	sum, 0xffff
+	.set	pop
 1:
 	.set	reorder
 	/* Add the passed partial csum.  */
@@ -373,7 +384,11 @@ small_csumcpy:
 
 #define ADDRMASK (NBYTES-1)
 
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	.set	noat
+#else
+	.set	at=v1
+#endif
 
 LEAF(__csum_partial_copy_user)
 	PTR_ADDU	AT, src, len	/* See (1) above. */
@@ -441,8 +456,10 @@ EXC(	STORE	t6, UNIT(6)(dst),	s_exc)
 	ADDC(sum, t6)
 EXC(	STORE	t7, UNIT(7)(dst),	s_exc)
 	ADDC(sum, t7)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 8*NBYTES
 	bgez	len, 1b
-	 ADD	dst, dst, 8*NBYTES
+	.set	noreorder
 	ADD	len, 8*NBYTES		# revert len (see above)
 
 	/*
@@ -471,8 +488,10 @@ EXC(	STORE	t2, UNIT(2)(dst),	s_exc)
 	ADDC(sum, t2)
 EXC(	STORE	t3, UNIT(3)(dst),	s_exc)
 	ADDC(sum, t3)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 4*NBYTES
 	beqz	len, done
-	 ADD	dst, dst, 4*NBYTES
+	.set	noreorder
 less_than_4units:
 	/*
 	 * rem = len % NBYTES
@@ -485,8 +504,10 @@ EXC(	LOAD	t0, 0(src),		l_exc)
 	SUB	len, len, NBYTES
 EXC(	STORE	t0, 0(dst),		s_exc)
 	ADDC(sum, t0)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	rem, len, 1b
-	 ADD	dst, dst, NBYTES
+	.set	noreorder
 
 	/*
 	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
@@ -572,8 +593,10 @@ EXC(	STORE	t2, UNIT(2)(dst),	s_exc)
 	ADDC(sum, t2)
 EXC(	STORE	t3, UNIT(3)(dst),	s_exc)
 	ADDC(sum, t3)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 4*NBYTES
 	bne	len, rem, 1b
-	 ADD	dst, dst, 4*NBYTES
+	.set	noreorder
 
 cleanup_src_unaligned:
 	beqz	len, done
@@ -587,8 +610,10 @@ EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
 	SUB	len, len, NBYTES
 EXC(	STORE	t0, 0(dst),		s_exc)
 	ADDC(sum, t0)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	len, rem, 1b
-	 ADD	dst, dst, NBYTES
+	.set	noreorder
 
 copy_bytes_checklen:
 	beqz	len, done
@@ -631,6 +656,8 @@ copy_bytes_done:
 	ADDC(sum, t2)
 done:
 	/* fold checksum */
+	.set	push
+	.set	noat
 #ifdef USE_DOUBLE
 	dsll32	v1, sum, 0
 	daddu	sum, v1
@@ -651,6 +678,7 @@ done:
 	srl	sum, sum, 8
 	or	sum, v1
 	andi	sum, 0xffff
+	.set	pop
 1:
 	.set reorder
 	ADDC(sum, psum)
@@ -678,8 +706,10 @@ EXC(	lbu	t1, 0(src),	l_exc)
 	SLLV	t1, t1, t2
 	addu	t2, SHIFT_INC
 	ADDC(sum, t1)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 1
 	bne	src, t0, 1b
-	 ADD	dst, dst, 1
+	.set	noreorder
 l_exc:
 	LOAD	t0, TI_TASK($28)
 	 nop
@@ -697,12 +727,22 @@ l_exc:
 	 * Clear len bytes starting at dst.  Can't call __bzero because it
 	 * might modify len.  An inefficient loop for these rare times...
 	 */
+	.set	reorder				/* DADDI_WAR */
+	SUB	src, len, 1
 	beqz	len, done
-	 SUB	src, len, 1
+	.set	noreorder
 1:	sb	zero, 0(dst)
 	ADD	dst, dst, 1
+	.set	push
+	.set	noat
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	bnez	src, 1b
 	 SUB	src, src, 1
+#else
+	li	v1, 1
+	bnez	src, 1b
+	 SUB	src, src, v1
+#endif
 	li	v1, -EFAULT
 	b	done
 	 sw	v1, (errptr)
@@ -712,4 +752,5 @@ s_exc:
 	li	v1, -EFAULT
 	jr	ra
 	 sw	v1, (errptr)
+	.set	pop
 	END(__csum_partial_copy_user)

 arch/mips/lib/memcpy-inatomic.S | +20 -5

@@ -9,6 +9,7 @@
  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  * Copyright (C) 2002 Broadcom, Inc.
  *   memcpy/copy_user author: Mark Vandevoorde
+ * Copyright (C) 2007  Maciej W. Rozycki
  *
  * Mnemonic names for arguments to memcpy/__copy_user
  */
@@ -175,7 +176,11 @@
 
 	.text
 	.set	noreorder
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	.set	noat
+#else
+	.set	at=v1
+#endif
 
 /*
  * A combined memcpy/__copy_user
@@ -268,8 +273,10 @@ EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
 	STORE	t1, UNIT(1)(dst)
 	STORE	t2, UNIT(2)(dst)
 	STORE	t3, UNIT(3)(dst)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 4*NBYTES
 	beqz	len, done
-	 ADD	dst, dst, 4*NBYTES
+	.set	noreorder
 less_than_4units:
 	/*
 	 * rem = len % NBYTES
@@ -281,8 +288,10 @@ EXC(	LOAD	t0, 0(src),		l_exc)
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
 	STORE	t0, 0(dst)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	rem, len, 1b
-	 ADD	dst, dst, NBYTES
+	.set	noreorder
 
 	/*
 	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
@@ -361,8 +370,10 @@ EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
 	STORE	t2, UNIT(2)(dst)
 	STORE	t3, UNIT(3)(dst)
 	PREF(	1, 9*32(dst) )     	# 1 is PREF_STORE (not streamed)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 4*NBYTES
 	bne	len, rem, 1b
-	 ADD	dst, dst, 4*NBYTES
+	.set	noreorder
 
 cleanup_src_unaligned:
 	beqz	len, done
@@ -375,8 +386,10 @@ EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
 	STORE	t0, 0(dst)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	len, rem, 1b
-	 ADD	dst, dst, NBYTES
+	.set	noreorder
 
 copy_bytes_checklen:
 	beqz	len, done
@@ -424,8 +437,10 @@ l_exc_copy:
 EXC(	lb	t1, 0(src),	l_exc)
 	ADD	src, src, 1
 	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 1
 	bne	src, t0, 1b
-	 ADD	dst, dst, 1
+	.set	noreorder
 l_exc:
 	LOAD	t0, TI_TASK($28)
 	 nop

 arch/mips/lib/memcpy.S | +47 -13

@@ -9,6 +9,7 @@
  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  * Copyright (C) 2002 Broadcom, Inc.
  *   memcpy/copy_user author: Mark Vandevoorde
+ * Copyright (C) 2007  Maciej W. Rozycki
  *
  * Mnemonic names for arguments to memcpy/__copy_user
  */
@@ -175,7 +176,11 @@
 
 	.text
 	.set	noreorder
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	.set	noat
+#else
+	.set	at=v1
+#endif
 
 /*
  * A combined memcpy/__copy_user
@@ -271,8 +276,10 @@ EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
 EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
 EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
 EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 4*NBYTES
 	beqz	len, done
-	 ADD	dst, dst, 4*NBYTES
+	.set	noreorder
 less_than_4units:
 	/*
 	 * rem = len % NBYTES
@@ -284,8 +291,10 @@ EXC(	LOAD	t0, 0(src),		l_exc)
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
 EXC(	STORE	t0, 0(dst),		s_exc_p1u)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	rem, len, 1b
-	 ADD	dst, dst, NBYTES
+	.set	noreorder
 
 	/*
 	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
@@ -364,8 +373,10 @@ EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
 EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
 EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
 	PREF(	1, 9*32(dst) )     	# 1 is PREF_STORE (not streamed)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 4*NBYTES
 	bne	len, rem, 1b
-	 ADD	dst, dst, 4*NBYTES
+	.set	noreorder
 
 cleanup_src_unaligned:
 	beqz	len, done
@@ -378,8 +389,10 @@ EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
 EXC(	STORE	t0, 0(dst),		s_exc_p1u)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	len, rem, 1b
-	 ADD	dst, dst, NBYTES
+	.set	noreorder
 
 copy_bytes_checklen:
 	beqz	len, done
@@ -427,8 +440,10 @@ l_exc_copy:
 EXC(	lb	t1, 0(src),	l_exc)
 	ADD	src, src, 1
 	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 1
 	bne	src, t0, 1b
-	 ADD	dst, dst, 1
+	.set	noreorder
 l_exc:
 	LOAD	t0, TI_TASK($28)
 	 nop
@@ -446,20 +461,33 @@ l_exc:
 	 * Clear len bytes starting at dst.  Can't call __bzero because it
 	 * might modify len.  An inefficient loop for these rare times...
 	 */
+	.set	reorder				/* DADDI_WAR */
+	SUB	src, len, 1
 	beqz	len, done
-	 SUB	src, len, 1
+	.set	noreorder
 1:	sb	zero, 0(dst)
 	ADD	dst, dst, 1
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	bnez	src, 1b
 	 SUB	src, src, 1
+#else
+	.set	push
+	.set	noat
+	li	v1, 1
+	bnez	src, 1b
+	 SUB	src, src, v1
+	.set	pop
+#endif
 	jr	ra
 	 nop
 
 
-#define SEXC(n)				\
-s_exc_p ## n ## u:			\
-	jr	ra;			\
-	 ADD	len, len, n*NBYTES
+#define SEXC(n)							\
+	.set	reorder;			/* DADDI_WAR */	\
+s_exc_p ## n ## u:						\
+	ADD	len, len, n*NBYTES;				\
+	jr	ra;						\
+	.set	noreorder
 
 SEXC(8)
 SEXC(7)
@@ -471,8 +499,10 @@ SEXC(2)
 SEXC(1)
 
 s_exc_p1:
+	.set	reorder				/* DADDI_WAR */
+	ADD	len, len, 1
 	jr	ra
-	 ADD	len, len, 1
+	.set	noreorder
 s_exc:
 	jr	ra
 	 nop
@@ -502,8 +532,10 @@ r_end_bytes:
 	SUB	a2, a2, 0x1
 	sb	t0, -1(a0)
 	SUB	a1, a1, 0x1
+	.set	reorder				/* DADDI_WAR */
+	SUB	a0, a0, 0x1
 	bnez	a2, r_end_bytes
-	 SUB	a0, a0, 0x1
+	.set	noreorder
 
 r_out:
 	jr	ra
@@ -514,8 +546,10 @@ r_end_bytes_up:
 	SUB	a2, a2, 0x1
 	sb	t0, (a0)
 	ADD	a1, a1, 0x1
+	.set	reorder				/* DADDI_WAR */
+	ADD	a0, a0, 0x1
 	bnez	a2, r_end_bytes_up
-	 ADD	a0, a0, 0x1
+	.set	noreorder
 
 	jr	ra
 	 move	a2, zero

 arch/mips/lib/memset.S | +10 -1

@@ -5,6 +5,7 @@
  *
  * Copyright (C) 1998, 1999, 2000 by Ralf Baechle
  * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
+ * Copyright (C) 2007  Maciej W. Rozycki
  */
 #include <asm/asm.h>
 #include <asm/asm-offsets.h>
@@ -74,8 +75,16 @@ FEXPORT(__bzero)
 	bnez		t0, small_memset
 	 andi		t0, a0, LONGMASK	/* aligned? */
 
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	beqz		t0, 1f
 	 PTR_SUBU	t0, LONGSIZE		/* alignment in bytes */
+#else
+	.set		noat
+	li		AT, LONGSIZE
+	beqz		t0, 1f
+	 PTR_SUBU	t0, AT			/* alignment in bytes */
+	.set		at
+#endif
 
 #ifdef __MIPSEB__
 	EX(LONG_S_L, a1, (a0), first_fixup)	/* make word/dword aligned */
@@ -106,7 +115,7 @@ memset_partial:
 	.set		noat
 	LONG_SRL		AT, t0, 1
 	PTR_SUBU	t1, AT
-	.set		noat
+	.set		at
 #endif
 	jr		t1
 	 PTR_ADDU	a0, t0			/* dest ptr */

 arch/mips/lib/strncpy_user.S | +2 -2

@@ -41,9 +41,9 @@ FEXPORT(__strncpy_from_user_nocheck_asm)
 	beqz		t0, 2f
 	 sb		t0, (a0)
 	PTR_ADDIU	v0, 1
-	bne		v0, a2, 1b
-	 PTR_ADDIU	a0, 1
 	.set		reorder
+	PTR_ADDIU	a0, 1
+	bne		v0, a2, 1b
 2:	PTR_ADDU	t0, a1, v0
 	xor		t0, a1
 	bltz		t0, fault

 arch/mips/mm/pg-r4k.c | +31 -35

@@ -4,6 +4,7 @@
  * for more details.
  *
  * Copyright (C) 2003, 04, 05 Ralf Baechle (ralf@linux-mips.org)
+ * Copyright (C) 2007  Maciej W. Rozycki
  */
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -12,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/proc_fs.h>
 
+#include <asm/bugs.h>
 #include <asm/cacheops.h>
 #include <asm/inst.h>
 #include <asm/io.h>
@@ -255,64 +257,58 @@ static inline void build_store_reg(int reg)
 	__build_store_reg(reg);
 }
 
-static inline void build_addiu_a2_a0(unsigned long offset)
+static inline void build_addiu_rt_rs(unsigned int rt, unsigned int rs,
+				     unsigned long offset)
 {
 	union mips_instruction mi;
 
 	BUG_ON(offset > 0x7fff);
 
-	mi.i_format.opcode     = cpu_has_64bit_gp_regs ? daddiu_op : addiu_op;
-	mi.i_format.rs         = 4;		/* $a0 */
-	mi.i_format.rt         = 6;		/* $a2 */
-	mi.i_format.simmediate = offset;
+	if (cpu_has_64bit_gp_regs && DADDI_WAR && r4k_daddiu_bug()) {
+		mi.i_format.opcode     = addiu_op;
+		mi.i_format.rs         = 0;	/* $zero */
+		mi.i_format.rt         = 25;	/* $t9 */
+		mi.i_format.simmediate = offset;
+		emit_instruction(mi);
 
+		mi.r_format.opcode     = spec_op;
+		mi.r_format.rs         = rs;
+		mi.r_format.rt         = 25;	/* $t9 */
+		mi.r_format.rd         = rt;
+		mi.r_format.re         = 0;
+		mi.r_format.func       = daddu_op;
+	} else {
+		mi.i_format.opcode     = cpu_has_64bit_gp_regs ?
+					 daddiu_op : addiu_op;
+		mi.i_format.rs         = rs;
+		mi.i_format.rt         = rt;
+		mi.i_format.simmediate = offset;
+	}
 	emit_instruction(mi);
 }
 
-static inline void build_addiu_a2(unsigned long offset)
+static inline void build_addiu_a2_a0(unsigned long offset)
 {
-	union mips_instruction mi;
-
-	BUG_ON(offset > 0x7fff);
-
-	mi.i_format.opcode     = cpu_has_64bit_gp_regs ? daddiu_op : addiu_op;
-	mi.i_format.rs         = 6;		/* $a2 */
-	mi.i_format.rt         = 6;		/* $a2 */
-	mi.i_format.simmediate = offset;
+	build_addiu_rt_rs(6, 4, offset);	/* $a2, $a0, offset */
+}
 
-	emit_instruction(mi);
+static inline void build_addiu_a2(unsigned long offset)
+{
+	build_addiu_rt_rs(6, 6, offset);	/* $a2, $a2, offset */
 }
 
 static inline void build_addiu_a1(unsigned long offset)
 {
-	union mips_instruction mi;
-
-	BUG_ON(offset > 0x7fff);
-
-	mi.i_format.opcode     = cpu_has_64bit_gp_regs ? daddiu_op : addiu_op;
-	mi.i_format.rs         = 5;		/* $a1 */
-	mi.i_format.rt         = 5;		/* $a1 */
-	mi.i_format.simmediate = offset;
+	build_addiu_rt_rs(5, 5, offset);	/* $a1, $a1, offset */
 
 	load_offset -= offset;
-
-	emit_instruction(mi);
 }
 
 static inline void build_addiu_a0(unsigned long offset)
 {
-	union mips_instruction mi;
-
-	BUG_ON(offset > 0x7fff);
-
-	mi.i_format.opcode     = cpu_has_64bit_gp_regs ? daddiu_op : addiu_op;
-	mi.i_format.rs         = 4;		/* $a0 */
-	mi.i_format.rt         = 4;		/* $a0 */
-	mi.i_format.simmediate = offset;
+	build_addiu_rt_rs(4, 4, offset);	/* $a0, $a0, offset */
 
 	store_offset -= offset;
-
-	emit_instruction(mi);
 }
 
 static inline void build_bne(unsigned int *dest)

 arch/mips/mm/tlbex.c | +30 -12

@@ -6,7 +6,7 @@
  * Synthesize TLB refill handlers at runtime.
  *
  * Copyright (C) 2004,2005,2006 by Thiemo Seufer
- * Copyright (C) 2005  Maciej W. Rozycki
+ * Copyright (C) 2005, 2007  Maciej W. Rozycki
  * Copyright (C) 2006  Ralf Baechle (ralf@linux-mips.org)
 *
 * ... and the days got worse and worse and now you see
@@ -27,6 +27,7 @@
 #include <linux/string.h>
 #include <linux/init.h>
 
+#include <asm/bugs.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
@@ -293,7 +294,7 @@ static void __init build_insn(u32 **buf, enum opcode opc, ...)
 			break;
 		}
 
-	if (!ip)
+	if (!ip || (opc == insn_daddiu && r4k_daddiu_bug()))
 		panic("Unsupported TLB synthesizer instruction %d", opc);
 
 	op = ip->match;
@@ -525,23 +526,33 @@ L_LA(_r3000_write_probe_fail)
 #define i_ssnop(buf) i_sll(buf, 0, 0, 1)
 #define i_ehb(buf) i_sll(buf, 0, 0, 3)
 
-#ifdef CONFIG_64BIT
 static __init int __maybe_unused in_compat_space_p(long addr)
 {
 	/* Is this address in 32bit compat space? */
+#ifdef CONFIG_64BIT
 	return (((addr) & 0xffffffff00000000L) == 0xffffffff00000000L);
+#else
+	return 1;
+#endif
 }
 
 static __init int __maybe_unused rel_highest(long val)
 {
+#ifdef CONFIG_64BIT
 	return ((((val + 0x800080008000L) >> 48) & 0xffff) ^ 0x8000) - 0x8000;
+#else
+	return 0;
+#endif
 }
 
 static __init int __maybe_unused rel_higher(long val)
 {
+#ifdef CONFIG_64BIT
 	return ((((val + 0x80008000L) >> 32) & 0xffff) ^ 0x8000) - 0x8000;
-}
+#else
+	return 0;
 #endif
+}
 
 static __init int rel_hi(long val)
 {
@@ -555,7 +566,6 @@ static __init int rel_lo(long val)
 
 static __init void i_LA_mostly(u32 **buf, unsigned int rs, long addr)
 {
-#ifdef CONFIG_64BIT
 	if (!in_compat_space_p(addr)) {
 		i_lui(buf, rs, rel_highest(addr));
 		if (rel_higher(addr))
@@ -567,16 +577,18 @@ static __init void i_LA_mostly(u32 **buf, unsigned int rs, long addr)
 		} else
 			i_dsll32(buf, rs, rs, 0);
 	} else
-#endif
 		i_lui(buf, rs, rel_hi(addr));
 }
 
-static __init void __maybe_unused i_LA(u32 **buf, unsigned int rs,
-					     long addr)
+static __init void __maybe_unused i_LA(u32 **buf, unsigned int rs, long addr)
 {
 	i_LA_mostly(buf, rs, addr);
-	if (rel_lo(addr))
-		i_ADDIU(buf, rs, rs, rel_lo(addr));
+	if (rel_lo(addr)) {
+		if (!in_compat_space_p(addr))
+			i_daddiu(buf, rs, rs, rel_lo(addr));
+		else
+			i_addiu(buf, rs, rs, rel_lo(addr));
+	}
 }
 
 /*
@@ -1085,7 +1097,10 @@ build_get_pgd_vmalloc64(u32 **p, struct label **l, struct reloc **r,
 	} else {
 		i_LA_mostly(p, ptr, modd);
 		il_b(p, r, label_vmalloc_done);
-		i_daddiu(p, ptr, ptr, rel_lo(modd));
+		if (in_compat_space_p(modd))
+			i_addiu(p, ptr, ptr, rel_lo(modd));
+		else
+			i_daddiu(p, ptr, ptr, rel_lo(modd));
 	}
 
 	l_vmalloc(l, *p);
@@ -1106,7 +1121,10 @@ build_get_pgd_vmalloc64(u32 **p, struct label **l, struct reloc **r,
 	} else {
 		i_LA_mostly(p, ptr, swpd);
 		il_b(p, r, label_vmalloc_done);
-		i_daddiu(p, ptr, ptr, rel_lo(swpd));
+		if (in_compat_space_p(swpd))
+			i_addiu(p, ptr, ptr, rel_lo(swpd));
+		else
+			i_daddiu(p, ptr, ptr, rel_lo(swpd));
 	}
 }
 

 include/asm-mips/delay.h | +10 -1

@@ -28,7 +28,7 @@ static inline void __delay(unsigned long loops)
 		"	.set	reorder					\n"
 		: "=r" (loops)
 		: "0" (loops));
-	else if (sizeof(long) == 8)
+	else if (sizeof(long) == 8 && !DADDI_WAR)
 		__asm__ __volatile__ (
 		"	.set	noreorder				\n"
 		"	.align	3					\n"
@@ -37,6 +37,15 @@ static inline void __delay(unsigned long loops)
 		"	.set	reorder					\n"
 		: "=r" (loops)
 		: "0" (loops));
+	else if (sizeof(long) == 8 && DADDI_WAR)
+		__asm__ __volatile__ (
+		"	.set	noreorder				\n"
+		"	.align	3					\n"
+		"1:	bnez	%0, 1b					\n"
+		"	dsubu	%0, %2					\n"
+		"	.set	reorder					\n"
+		: "=r" (loops)
+		: "0" (loops), "r" (1));
 }
 
 

 include/asm-mips/stackframe.h | +9 -0

@@ -6,6 +6,7 @@
  * Copyright (C) 1994, 95, 96, 99, 2001 Ralf Baechle
  * Copyright (C) 1994, 1995, 1996 Paul M. Antoine.
  * Copyright (C) 1999 Silicon Graphics, Inc.
+ * Copyright (C) 2007  Maciej W. Rozycki
  */
 #ifndef _ASM_STACKFRAME_H
 #define _ASM_STACKFRAME_H
@@ -145,8 +146,16 @@
 		.set	reorder
 		/* Called from user mode, new stack. */
 		get_saved_sp
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 8:		move	k0, sp
 		PTR_SUBU sp, k1, PT_SIZE
+#else
+		.set	at=k0
+8:		PTR_SUBU k1, PT_SIZE
+		.set	noat
+		move	k0, sp
+		move	sp, k1
+#endif
 		LONG_S	k0, PT_R29(sp)
 		LONG_S	$3, PT_R3(sp)
 		/*

 include/asm-mips/uaccess.h | +10 -3

@@ -5,6 +5,7 @@
  *
  * Copyright (C) 1996, 1997, 1998, 1999, 2000, 03, 04 by Ralf Baechle
  * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
+ * Copyright (C) 2007  Maciej W. Rozycki
  */
 #ifndef _ASM_UACCESS_H
 #define _ASM_UACCESS_H
@@ -387,6 +388,12 @@ extern void __put_user_unknown(void);
 	"jal\t" #destination "\n\t"
 #endif
 
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
+#define DADDI_SCRATCH "$0"
+#else
+#define DADDI_SCRATCH "$3"
+#endif
+
 extern size_t __copy_user(void *__to, const void *__from, size_t __n);
 
 #define __invoke_copy_to_user(to, from, n)				\
@@ -403,7 +410,7 @@ extern size_t __copy_user(void *__to, const void *__from, size_t __n);
 	: "+r" (__cu_to_r), "+r" (__cu_from_r), "+r" (__cu_len_r)	\
 	:								\
 	: "$8", "$9", "$10", "$11", "$12", "$15", "$24", "$31",		\
-	  "memory");							\
+	  DADDI_SCRATCH, "memory");					\
 	__cu_len_r;							\
 })
 
@@ -512,7 +519,7 @@ extern size_t __copy_user_inatomic(void *__to, const void *__from, size_t __n);
 	: "+r" (__cu_to_r), "+r" (__cu_from_r), "+r" (__cu_len_r)	\
 	:								\
 	: "$8", "$9", "$10", "$11", "$12", "$15", "$24", "$31",		\
-	  "memory");							\
+	  DADDI_SCRATCH, "memory");					\
 	__cu_len_r;							\
 })
 
@@ -535,7 +542,7 @@ extern size_t __copy_user_inatomic(void *__to, const void *__from, size_t __n);
 	: "+r" (__cu_to_r), "+r" (__cu_from_r), "+r" (__cu_len_r)	\
 	:								\
 	: "$8", "$9", "$10", "$11", "$12", "$15", "$24", "$31",		\
-	  "memory");							\
+	  DADDI_SCRATCH, "memory");					\
 	__cu_len_r;							\
 })