@@ -1470,7 +1470,10 @@ STD_ENTRY(_sys_clone)
  * We place it in the __HEAD section to ensure it is relatively
  * near to the intvec_SWINT_1 code (reachable by a conditional branch).
  *
- * Must match register usage in do_page_fault().
+ * Our use of ATOMIC_LOCK_REG here must match do_page_fault_ics().
+ *
+ * As we do in lib/atomic_asm_32.S, we bypass a store if the value we
+ * would store is the same as the value we just loaded.
  */
 	__HEAD
 	.align 64
@@ -1531,17 +1534,7 @@ ENTRY(sys_cmpxchg)
 	{
 	 shri	r20, r25, 32 - ATOMIC_HASH_L1_SHIFT
 	 slt_u  r23, r0, r23
-
-	/*
-	 * Ensure that the TLB is loaded before we take out the lock.
-	 * On TILEPro, this will start fetching the value all the way
-	 * into our L1 as well (and if it gets modified before we
-	 * grab the lock, it will be invalidated from our cache
-	 * before we reload it).  On tile64, we'll start fetching it
-	 * into our L1 if we're the home, and if we're not, we'll
-	 * still at least start fetching it into the home's L2.
-	 */
-	 lw	r26, r0
+	 lw     r26, r0  /* see comment in the "#else" for the "lw r26". */
 	}
 	{
 	 s2a    r21, r20, r21
@@ -1557,18 +1550,9 @@ ENTRY(sys_cmpxchg)
 	 bbs    r23, .Lcmpxchg64
 	 andi   r23, r0, 7       /* Precompute alignment for cmpxchg64. */
 	}
-
 	{
-	/*
-	 * We very carefully align the code that actually runs with
-	 * the lock held (nine bundles) so that we know it is all in
-	 * the icache when we start.  This instruction (the jump) is
-	 * at the start of the first cache line, address zero mod 64;
-	 * we jump to somewhere in the second cache line to issue the
-	 * tns, then jump back to finish up.
-	 */
 	 s2a    ATOMIC_LOCK_REG_NAME, r25, r21
-	 j      .Lcmpxchg32_tns
+	 j      .Lcmpxchg32_tns         /* see comment in the #else for the jump. */
 	}
 
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
@@ -1633,24 +1617,25 @@ ENTRY(sys_cmpxchg)
 	{
 	/*
 	 * We very carefully align the code that actually runs with
-	 * the lock held (nine bundles) so that we know it is all in
+	 * the lock held (twelve bundles) so that we know it is all in
 	 * the icache when we start.  This instruction (the jump) is
 	 * at the start of the first cache line, address zero mod 64;
-	 * we jump to somewhere in the second cache line to issue the
-	 * tns, then jump back to finish up.
+	 * we jump to the very end of the second cache line to get that
+	 * line loaded in the icache, then fall through to issue the tns
+	 * in the third cache line, at which point it's all cached.
+	 * Note that this is for performance, not correctness.
 	 */
 	j	.Lcmpxchg32_tns
 	}
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
-	ENTRY(__sys_cmpxchg_grab_lock)
+/* Symbol for do_page_fault_ics() to use to compare against the PC. */
+.global __sys_cmpxchg_grab_lock
+__sys_cmpxchg_grab_lock:
 
 	/*
 	 * Perform the actual cmpxchg or atomic_update.
-	 * Note that the system <arch/atomic.h> header relies on
-	 * atomic_update() to always perform an "mf", so don't make
-	 * it optional or conditional without modifying that code.
 	 */
 .Ldo_cmpxchg32:
 	{
@@ -1668,10 +1653,13 @@ ENTRY(sys_cmpxchg)
 	}
 	{
 	 mvnz	r24, r23, r25 /* Use atomic_update value if appropriate. */
-	 bbns   r22, .Lcmpxchg32_mismatch
+	 bbns   r22, .Lcmpxchg32_nostore
 	}
+	seq     r22, r24, r21	/* Are we storing the value we loaded? */
+	bbs     r22, .Lcmpxchg32_nostore
 	sw	r0, r24
 
+	/* The following instruction is the start of the second cache line. */
 	/* Do slow mtspr here so the following "mf" waits less. */
 	{
 	 move   sp, r27
@@ -1679,7 +1667,6 @@ ENTRY(sys_cmpxchg)
 	}
 	mf
 
-	/* The following instruction is the start of the second cache line. */
 	{
 	 move   r0, r21
 	 sw	ATOMIC_LOCK_REG_NAME, zero
@@ -1687,7 +1674,7 @@ ENTRY(sys_cmpxchg)
 	iret
 
 	/* Duplicated code here in the case where we don't overlap "mf" */
-.Lcmpxchg32_mismatch:
+.Lcmpxchg32_nostore:
 	{
 	 move   r0, r21
 	 sw	ATOMIC_LOCK_REG_NAME, zero
@@ -1703,8 +1690,6 @@ ENTRY(sys_cmpxchg)
 	 * and for 64-bit cmpxchg.  We provide it as a macro and put
 	 * it into both versions.  We can't share the code literally
 	 * since it depends on having the right branch-back address.
-	 * Note that the first few instructions should share the cache
-	 * line with the second half of the actual locked code.
 	 */
 	.macro  cmpxchg_lock, bitwidth
 
@@ -1730,7 +1715,7 @@ ENTRY(sys_cmpxchg)
 	}
 	/*
 	 * The preceding instruction is the last thing that must be
-	 * on the second cache line.
+	 * hot in the icache before we do the "tns" above.
 	 */
 
 #ifdef CONFIG_SMP
@@ -1761,6 +1746,12 @@ ENTRY(sys_cmpxchg)
 	.endm
 
 .Lcmpxchg32_tns:
+	/*
+	 * This is the last instruction on the second cache line.
+	 * The nop here loads the second line, then we fall through
+	 * to the tns to load the third line before we take the lock.
+	 */
+	nop
 	cmpxchg_lock 32
 
 	/*