@@ -1470,7 +1470,10 @@ STD_ENTRY(_sys_clone)
  * We place it in the __HEAD section to ensure it is relatively
  * near to the intvec_SWINT_1 code (reachable by a conditional branch).
  *
- * Must match register usage in do_page_fault().
+ * Our use of ATOMIC_LOCK_REG here must match do_page_fault_ics().
+ *
+ * As we do in lib/atomic_asm_32.S, we bypass a store if the value we
+ * would store is the same as the value we just loaded.
  */
 	__HEAD
 	.align 64
@@ -1531,17 +1534,7 @@ ENTRY(sys_cmpxchg)
 	{
 	 shri	r20, r25, 32 - ATOMIC_HASH_L1_SHIFT
 	 slt_u  r23, r0, r23
-
-	/*
-	 * Ensure that the TLB is loaded before we take out the lock.
-	 * On TILEPro, this will start fetching the value all the way
-	 * into our L1 as well (and if it gets modified before we
-	 * grab the lock, it will be invalidated from our cache
-	 * before we reload it).  On tile64, we'll start fetching it
-	 * into our L1 if we're the home, and if we're not, we'll
-	 * still at least start fetching it into the home's L2.
-	 */
-	 lw	r26, r0
+	 lw     r26, r0  /* see comment in the "#else" for the "lw r26". */
 	}
 	{
 	 s2a    r21, r20, r21
@@ -1557,18 +1550,9 @@ ENTRY(sys_cmpxchg)
 	 bbs    r23, .Lcmpxchg64
 	 andi   r23, r0, 7       /* Precompute alignment for cmpxchg64. */
 	}
-
 	{
-	/*
-	 * We very carefully align the code that actually runs with
-	 * the lock held (nine bundles) so that we know it is all in
-	 * the icache when we start.  This instruction (the jump) is
-	 * at the start of the first cache line, address zero mod 64;
-	 * we jump to somewhere in the second cache line to issue the
-	 * tns, then jump back to finish up.
-	 */
 	 s2a    ATOMIC_LOCK_REG_NAME, r25, r21
-	 j      .Lcmpxchg32_tns
+	 j      .Lcmpxchg32_tns         /* see comment in the #else for the jump. */
 	}
 
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
@@ -1633,24 +1617,25 @@ ENTRY(sys_cmpxchg)
 	{
 	/*
 	 * We very carefully align the code that actually runs with
-	 * the lock held (nine bundles) so that we know it is all in
+	 * the lock held (twelve bundles) so that we know it is all in
 	 * the icache when we start.  This instruction (the jump) is
 	 * at the start of the first cache line, address zero mod 64;
-	 * we jump to somewhere in the second cache line to issue the
-	 * tns, then jump back to finish up.
+	 * we jump to the very end of the second cache line to get that
+	 * line loaded in the icache, then fall through to issue the tns
+	 * in the third cache line, at which point it's all cached.
+	 * Note that this is for performance, not correctness.
 	 */
 	j	.Lcmpxchg32_tns
 	}
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
-	ENTRY(__sys_cmpxchg_grab_lock)
+/* Symbol for do_page_fault_ics() to use to compare against the PC. */
+.global __sys_cmpxchg_grab_lock
+__sys_cmpxchg_grab_lock:
 
 	/*
 	 * Perform the actual cmpxchg or atomic_update.
-	 * Note that the system <arch/atomic.h> header relies on
-	 * atomic_update() to always perform an "mf", so don't make
-	 * it optional or conditional without modifying that code.
 	 */
 .Ldo_cmpxchg32:
 	{
@@ -1668,10 +1653,13 @@ ENTRY(sys_cmpxchg)
 	}
 	{
 	 mvnz	r24, r23, r25 /* Use atomic_update value if appropriate. */
-	 bbns   r22, .Lcmpxchg32_mismatch
+	 bbns   r22, .Lcmpxchg32_nostore
 	}
+	seq     r22, r24, r21	/* Are we storing the value we loaded? */
+	bbs     r22, .Lcmpxchg32_nostore
 	sw	r0, r24
 
+	/* The following instruction is the start of the second cache line. */
 	/* Do slow mtspr here so the following "mf" waits less. */
 	{
 	 move   sp, r27
@@ -1679,7 +1667,6 @@ ENTRY(sys_cmpxchg)
 	}
 	mf
 
-	/* The following instruction is the start of the second cache line. */
 	{
 	 move   r0, r21
 	 sw	ATOMIC_LOCK_REG_NAME, zero
@@ -1687,7 +1674,7 @@ ENTRY(sys_cmpxchg)
 	iret
 
 	/* Duplicated code here in the case where we don't overlap "mf" */
-.Lcmpxchg32_mismatch:
+.Lcmpxchg32_nostore:
 	{
 	 move   r0, r21
 	 sw	ATOMIC_LOCK_REG_NAME, zero
@@ -1703,8 +1690,6 @@ ENTRY(sys_cmpxchg)
 	 * and for 64-bit cmpxchg.  We provide it as a macro and put
 	 * it into both versions.  We can't share the code literally
 	 * since it depends on having the right branch-back address.
-	 * Note that the first few instructions should share the cache
-	 * line with the second half of the actual locked code.
 	 */
 	.macro  cmpxchg_lock, bitwidth
 
@@ -1730,7 +1715,7 @@ ENTRY(sys_cmpxchg)
 	}
 	/*
 	 * The preceding instruction is the last thing that must be
-	 * on the second cache line.
+	 * hot in the icache before we do the "tns" above.
 	 */
 
 #ifdef CONFIG_SMP
@@ -1761,6 +1746,12 @@ ENTRY(sys_cmpxchg)
 	.endm
 
 .Lcmpxchg32_tns:
+	/*
+	 * This is the last instruction on the second cache line.
+	 * The nop here loads the second line, then we fall through
+	 * to the tns to load the third line before we take the lock.
+	 */
+	nop
 	cmpxchg_lock 32
 
 	/*