From a36bdfe69f2356727df6e85e6d4c1307f2ae8f56 Mon Sep 17 00:00:00 2001
From: Nathan Hjelm <hjelmn@lanl.gov>
Date: Fri, 2 Sep 2016 23:47:47 -0600
Subject: [PATCH] opal/asm: updates to powerpc assembly

This commit contains the following changes:

 - There is a bug in the PGI 16.x betas for ppc64 that causes them to
   emit the incorrect instruction for loading 64-bit operands. If not
   cast to void * the operands are loaded with lwz (load word and
   zero) instead of ld. This does not affect optimized mode. The work
   around is to cast to void * and was implemented similar to a
   work-around for a xlc bug.

 - Actually implement 64-bit add/sub. These functions were missing and
   fell back to the less efficient compare-and-swap implementations.

Thanks to @PHHargrove for helping to track this down. With this update
the GCC inline assembly works as expected with pgi and ppc64.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
---
 opal/include/opal/sys/powerpc/atomic.h | 61 ++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 8 deletions(-)

diff --git a/opal/include/opal/sys/powerpc/atomic.h b/opal/include/opal/sys/powerpc/atomic.h
index d10437948d..745248aa0e 100644
--- a/opal/include/opal/sys/powerpc/atomic.h
+++ b/opal/include/opal/sys/powerpc/atomic.h
@@ -11,7 +11,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2010      IBM Corporation.  All rights reserved.
- * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
  *
@@ -55,6 +55,9 @@
 #define OPAL_HAVE_ATOMIC_CMPSET_64 1
 #define OPAL_HAVE_ATOMIC_SWAP_64 1
 #define OPAL_HAVE_ATOMIC_LLSC_64 1
+#define OPAL_HAVE_ATOMIC_MATH_64 1
+#define OPAL_HAVE_ATOMIC_ADD_64 1
+#define OPAL_HAVE_ATOMIC_SUB_64 1
 #endif
 
 
@@ -128,6 +131,16 @@ void opal_atomic_isync(void)
 #define OPAL_ASM_ADDR(a) (a)
 #endif
 
+#if defined(__PGI)
+/* work-around for bug in PGI 16.5-16.7 where the compiler fails to
+ * correctly emit load instructions for 64-bit operands. without this
+ * it will emit lwz instead of ld to load the 64-bit operand. */
+#define OPAL_ASM_VALUE64(x) (void *)(intptr_t) (x)
+#else
+#define OPAL_ASM_VALUE64(x) x
+#endif
+
+
 static inline int opal_atomic_cmpset_32(volatile int32_t *addr,
                                         int32_t oldval, int32_t newval)
 {
@@ -217,6 +230,38 @@ static inline int32_t opal_atomic_swap_32(volatile int32_t *addr, int32_t newval
 #if (OPAL_ASSEMBLY_ARCH == OPAL_POWERPC64)
 
 #if  OPAL_GCC_INLINE_ASSEMBLY
+static inline int64_t opal_atomic_add_64 (volatile int64_t* v, int64_t inc)
+{
+   int64_t t;
+
+   __asm__ __volatile__("1:   ldarx   %0, 0, %3    \n\t"
+                        "     add     %0, %2, %0   \n\t"
+                        "     stdcx.  %0, 0, %3    \n\t"
+                        "     bne-    1b           \n\t"
+                        : "=&r" (t), "+m" (*v)
+                        : "r" (OPAL_ASM_VALUE64(inc)), "r" OPAL_ASM_ADDR(v)
+                        : "cc");
+
+   return t;
+}
+
+
+static inline int64_t opal_atomic_sub_64 (volatile int64_t* v, int64_t dec)
+{
+   int64_t t;
+
+   __asm__ __volatile__(
+                        "1:   ldarx   %0,0,%3      \n\t"
+                        "     subf    %0,%2,%0     \n\t"
+                        "     stdcx.  %0,0,%3      \n\t"
+                        "     bne-    1b           \n\t"
+                        : "=&r" (t), "+m" (*v)
+                        : "r" (OPAL_ASM_VALUE64(dec)), "r" OPAL_ASM_ADDR(v)
+                        : "cc");
+
+   return t;
+}
+
 static inline int opal_atomic_cmpset_64(volatile int64_t *addr,
                                         int64_t oldval, int64_t newval)
 {
@@ -229,8 +274,8 @@ static inline int opal_atomic_cmpset_64(volatile int64_t *addr,
                          "   stdcx.  %4, 0, %2  \n\t"
                          "   bne-    1b         \n\t"
                          "2:"
-                         : "=&r" (ret), "=m" (*addr)
-                         : "r" (addr), "r" (oldval), "r" (newval), "m" (*addr)
+                         : "=&r" (ret), "+m" (*addr)
+                         : "r" (addr), "r" (OPAL_ASM_VALUE64(oldval)), "r" (OPAL_ASM_VALUE64(newval))
                          : "cc", "memory");
 
    return (ret == oldval);
@@ -249,15 +294,15 @@ static inline int64_t opal_atomic_ll_64(volatile int64_t *addr)
 
 static inline int opal_atomic_sc_64(volatile int64_t *addr, int64_t newval)
 {
-    int32_t ret, foo;
+    int32_t ret;
 
-    __asm__ __volatile__ ("   stdcx.  %4, 0, %3  \n\t"
+    __asm__ __volatile__ ("   stdcx.  %2, 0, %1  \n\t"
                           "   li      %0,0       \n\t"
                           "   bne-    1f         \n\t"
                           "   ori     %0,%0,1    \n\t"
                           "1:"
-                          : "=r" (ret), "=m" (*addr), "=r" (foo)
-                          : "r" (addr), "r" (newval)
+                          : "=r" (ret)
+                          : "r" (addr), "r" (OPAL_ASM_VALUE64(newval))
                           : "cc", "memory");
     return ret;
 }
@@ -294,7 +339,7 @@ static inline int64_t opal_atomic_swap_64(volatile int64_t *addr, int64_t newval
                          "   stdcx.  %3, 0, %2  \n\t"
                          "   bne-    1b         \n\t"
                          : "=&r" (ret), "=m" (*addr)
-                         : "r" (addr), "r" (newval)
+                         : "r" (addr), "r" (OPAL_ASM_VALUE64(newval))
                          : "cc", "memory");
 
    return ret;