diff --git a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
index 84560beda9d6e..1cf3fb8750c2e 100644
--- a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
+++ b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
@@ -9,14 +9,36 @@
 #include <spirv/spirv.h>
 #include <spirv/spirv_types.h>
 
+int __clc_nvvm_reflect_arch();
+
 _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory,
                                                   unsigned int semantics) {
-  if (memory == CrossDevice) {
-    __nvvm_membar_sys();
-  } else if (memory == Device) {
-    __nvvm_membar_gl();
-  } else {
-    __nvvm_membar_cta();
+
+  // For sm_70 and above, membar becomes semantically identical to fence.sc.
+  // However, sm_70 and above also introduce a lightweight fence.acq_rel that
+  // can be used to form either acquire or release strong operations.
+  // Consult
+  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
+  // for details.
+
+  unsigned int order = semantics & 0x1F;
+  if (__clc_nvvm_reflect_arch() < 700 ||
+      order == SequentiallyConsistent) {
+    if (memory == CrossDevice) {
+      __nvvm_membar_sys();
+    } else if (memory == Device) {
+      __nvvm_membar_gl();
+    } else {
+      __nvvm_membar_cta();
+    }
+  } else if (order != None) {
+    if (memory == CrossDevice) {
+      __asm__ __volatile__("fence.acq_rel.sys;");
+    } else if (memory == Device) {
+      __asm__ __volatile__("fence.acq_rel.gpu;");
+    } else {
+      __asm__ __volatile__("fence.acq_rel.cta;");
+    }
   }
 }
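
To make the new dispatch concrete, here is a minimal host-side C sketch (not part of the patch) that mirrors the selection logic and prints which PTX instruction each (scope, semantics) pair would map to. The enum values are assumptions taken from the SPIR-V specification's Scope and Memory Semantics encodings, and arch() / barrier_instr() are hypothetical stand-ins for __clc_nvvm_reflect_arch() and the barrier body:

/* Illustrative sketch only: models the dispatch in __spirv_MemoryBarrier.
 * Constants below are assumed SPIR-V encodings; the real code gets them
 * from libclc's spirv headers. */
#include <stdio.h>

enum Scope { CrossDevice = 0, Device = 1, Workgroup = 2 };
enum Semantics {
  None = 0x0,
  Acquire = 0x2,
  Release = 0x4,
  AcquireRelease = 0x8,
  SequentiallyConsistent = 0x10
};

/* Hypothetical stand-in for __clc_nvvm_reflect_arch(); pretend sm_80. */
static int arch(void) { return 800; }

static const char *barrier_instr(unsigned memory, unsigned semantics) {
  unsigned order = semantics & 0x1F; /* same ordering mask as the patch */
  if (arch() < 700 || order == SequentiallyConsistent) {
    /* Pre-sm_70, or seq_cst ordering: full membar. */
    return memory == CrossDevice ? "membar.sys"
           : memory == Device    ? "membar.gl"
                                 : "membar.cta";
  } else if (order != None) {
    /* sm_70+ acquire/release: lightweight fence. */
    return memory == CrossDevice ? "fence.acq_rel.sys"
           : memory == Device    ? "fence.acq_rel.gpu"
                                 : "fence.acq_rel.cta";
  }
  return "(no fence: relaxed ordering)";
}

int main(void) {
  printf("%s\n", barrier_instr(Device, Acquire));                /* fence.acq_rel.gpu */
  printf("%s\n", barrier_instr(Device, SequentiallyConsistent)); /* membar.gl */
  printf("%s\n", barrier_instr(CrossDevice, None));              /* no fence */
  return 0;
}

On this assumed sm_80 device, an Acquire barrier at Device scope maps to the lightweight fence.acq_rel.gpu, while SequentiallyConsistent still falls back to the full membar.gl, matching the comment in the patch.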