/*
 * Copyright 1993-2020 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * The Licensed Deliverables contained herein are PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and are being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R. 12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

#ifndef _CUDA_PIPELINE
#define _CUDA_PIPELINE

#include <cuda/barrier>
#include <cuda/atomic>
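
// cuda::pipeline coordinates asynchronous data movement (notably memcpy_async)
// through a ring of stages. Each stage is guarded by a "produced" barrier and,
// in partitioned producer/consumer pipelines, a "consumed" barrier, both held
// in a user-provided pipeline_shared_state.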

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

    template<thread_scope _Scope>
    class pipeline;

    // In a partitioned pipeline, each thread participates either as a producer
    // (acquires and commits stages) or as a consumer (waits on and releases them).
    enum class pipeline_role {
        producer,
        consumer
    };

    // One ring-buffer stage: producers arrive on __produced, consumers on __consumed.
    template<thread_scope _Scope>
    struct __pipeline_stage {
        barrier<_Scope> __produced;
        barrier<_Scope> __consumed;
    };

    // Storage for a pipeline's stage barriers and reference count. It must live in
    // memory visible to every participating thread, e.g. __shared__ for a
    // thread_scope_block pipeline.
    template<thread_scope _Scope, uint8_t _Stages_count>
    class pipeline_shared_state {
    public:
        pipeline_shared_state() = default;
        pipeline_shared_state(const pipeline_shared_state &) = delete;
        pipeline_shared_state(pipeline_shared_state &&) = delete;
        pipeline_shared_state & operator=(pipeline_shared_state &&) = delete;
        pipeline_shared_state & operator=(const pipeline_shared_state &) = delete;

    private:
        __pipeline_stage<_Scope> __stages[_Stages_count];
        atomic<uint32_t, _Scope> __refcount;

        template<thread_scope _Pipeline_scope>
        friend class pipeline;

        template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count>
        friend _LIBCUDACXX_INLINE_VISIBILITY
        pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state);

        template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count>
        friend _LIBCUDACXX_INLINE_VISIBILITY
        pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, size_t __producer_count);
     
        template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count>
        friend _LIBCUDACXX_INLINE_VISIBILITY
        pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, pipeline_role __role);
    };

    // Reads the calling thread's lane index within its warp.
    struct __pipeline_asm_helper {
        __device__
        static inline uint32_t __lane_id()
        {
            uint32_t __lane_id;
            asm volatile ("mov.u32 %0, %%laneid;" : "=r"(__lane_id));
            return __lane_id;
        }
    };
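
    // Illustrative sketch of a unified block-scoped pipeline, in which every thread
    // both produces and consumes; the kernel and buffer names are hypothetical:
    //
    //   #include <cooperative_groups.h>
    //
    //   __global__ void kernel(int * gmem, size_t count) {
    //       extern __shared__ int smem[];
    //       auto block = cooperative_groups::this_thread_block();
    //       __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, 2> state;
    //       auto pipe = cuda::make_pipeline(block, &state);
    //
    //       pipe.producer_acquire();
    //       cuda::memcpy_async(block, smem, gmem, sizeof(int) * block.size(), pipe);
    //       pipe.producer_commit();
    //       pipe.consumer_wait();
    //       // ... compute on smem ...
    //       pipe.consumer_release();
    //   }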

    template<thread_scope _Scope>
    class pipeline {
    public:
        pipeline(pipeline &&) = default;
        pipeline(const pipeline &) = delete;
        pipeline & operator=(pipeline &&) = delete;
        pipeline & operator=(const pipeline &) = delete;

        _LIBCUDACXX_INLINE_VISIBILITY
        ~pipeline()
        {
            if (__active) {
                (void)quit();
            }
        }

        // Releases this thread's participation in the pipeline. Returns true in the
        // single thread that dropped the last reference and destroyed the stage barriers.
        _LIBCUDACXX_INLINE_VISIBILITY
        bool quit()
        {
#ifdef __CUDA_ARCH__
            // Elect one lane per group of converged threads sharing this state to
            // perform a single combined refcount decrement.
            const uint32_t __match_mask = __match_any_sync(__activemask(), reinterpret_cast<uintptr_t>(__shared_state_get_refcount()));
            const uint32_t __elected_id = __ffs(__match_mask) - 1;
            const bool __elected = (__pipeline_asm_helper::__lane_id() == __elected_id);
            const uint32_t __sub_count = __popc(__match_mask);
#else
            const bool __elected = true;
            const uint32_t __sub_count = 1;
#endif
            bool __released = false;
            if (__elected) {
                const uint32_t __old = __shared_state_get_refcount()->fetch_sub(__sub_count);
                const bool __last = (__old == __sub_count);
                if (__last) {
                    for (uint8_t __stage = 0; __stage < __stages_count; ++__stage) {
                        __shared_state_get_stage(__stage)->__produced.~barrier();
                        if (__partitioned) {
                            __shared_state_get_stage(__stage)->__consumed.~barrier();
                        }
                    }
                    __released = true;
                }
            }
            __active = false;
            return __released;
        }

        // Blocks until all consumers have released the head stage. Unified pipelines
        // have no consumed barrier, so acquisition succeeds immediately.
        _LIBCUDACXX_INLINE_VISIBILITY
        void producer_acquire()
        {
            if (__partitioned) {
                barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__head)->__consumed;
                (void)_CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(__stage_barrier, __consumed_phase_parity));
            }
        }

        // Folds previously submitted asynchronous operations into the head stage's
        // "produced" barrier, arrives on it, and advances the head.
        _LIBCUDACXX_INLINE_VISIBILITY
        void producer_commit()
        {
            barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__head)->__produced;
            __memcpy_async_synchronize(__stage_barrier, true);
            (void)__stage_barrier.arrive();
            if (++__head == __stages_count) {
                __head = 0;
                if (__partitioned) {
                    __consumed_phase_parity = !__consumed_phase_parity;
                }
            }
        }

        // Blocks until the tail stage has been fully produced.
        _LIBCUDACXX_INLINE_VISIBILITY
        void consumer_wait()
        {
            barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__tail)->__produced;
            (void)_CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(__stage_barrier, __produced_phase_parity));
        }

        // Arrives on the tail stage's "consumed" barrier (partitioned pipelines only)
        // and advances the tail.
        _LIBCUDACXX_INLINE_VISIBILITY
        void consumer_release()
        {
            if (__partitioned) {
                (void)__shared_state_get_stage(__tail)->__consumed.arrive();
            }
            if (++__tail == __stages_count) {
                __tail = 0;
                __produced_phase_parity = !__produced_phase_parity;
            }
        }

        template<class _Rep, class _Period>
        _LIBCUDACXX_INLINE_VISIBILITY
        bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period> & __duration)
        {
            barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__tail)->__produced;
            return _CUDA_VSTD::__libcpp_thread_poll_with_backoff(
                __poll_tester(__stage_barrier, __produced_phase_parity),
                _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__duration)
            );
        }

        template<class _Clock, class _Duration> 
        _LIBCUDACXX_INLINE_VISIBILITY
        bool consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration> & __time_point)
        {
            return consumer_wait_for(__time_point - _Clock::now());
        }

    private:
        uint8_t __head               : 8;   // next stage to be produced
        uint8_t __tail               : 8;   // next stage to be consumed
        const uint8_t __stages_count : 8;
        bool __consumed_phase_parity : 1;
        bool __produced_phase_parity : 1;
        bool __active                : 1;
        const bool __partitioned     : 1;
        char * const __shared_state;

        _LIBCUDACXX_INLINE_VISIBILITY
        pipeline(char * __shared_state, uint8_t __stages_count, bool __partitioned)
            : __head(0)
            , __tail(0)
            , __stages_count(__stages_count)
            , __consumed_phase_parity(true)
            , __produced_phase_parity(false)
            , __active(true)
            , __partitioned(__partitioned)
            , __shared_state(__shared_state)
        {}

        // Tests the barrier's phase by synthesizing an arrival token whose most
        // significant bit carries the expected phase parity.
        _LIBCUDACXX_INLINE_VISIBILITY
        static bool __barrier_try_wait_parity_impl(barrier<_Scope> & __barrier, bool __phase_parity)
        {
            typename barrier<_Scope>::arrival_token __synthesized_token = (__phase_parity ? 1ull : 0ull) << 63;
            return __barrier.__try_wait(_CUDA_VSTD::move(__synthesized_token));
        }

        _LIBCUDACXX_INLINE_VISIBILITY
        static bool __barrier_try_wait_parity(barrier<_Scope> & __barrier, bool __phase_parity)
        {
            return __barrier_try_wait_parity_impl(__barrier, __phase_parity);
        }

        struct __poll_tester {
            barrier<_Scope> & __barrier;
            bool __phase_parity;

            _LIBCUDACXX_INLINE_VISIBILITY
            __poll_tester(barrier<_Scope> & __barrier, bool __phase_parity)
              : __barrier(__barrier)
              , __phase_parity(__phase_parity)
            {}

            _LIBCUDACXX_INLINE_VISIBILITY
            bool operator()() const
            {
                return __barrier_try_wait_parity(__barrier, __phase_parity);
            }
        };

        // The shared state is laid out as the stage array immediately followed by the
        // refcount; these accessors recover typed pointers from that layout.
        _LIBCUDACXX_INLINE_VISIBILITY
        __pipeline_stage<_Scope> * __shared_state_get_stage(uint8_t __stage)
        {
            ptrdiff_t __stage_offset = __stage * sizeof(__pipeline_stage<_Scope>);
            return reinterpret_cast<__pipeline_stage<_Scope>*>(__shared_state + __stage_offset);
        }

        _LIBCUDACXX_INLINE_VISIBILITY
        atomic<uint32_t, _Scope> * __shared_state_get_refcount()
        {
            ptrdiff_t __refcount_offset = __stages_count * sizeof(__pipeline_stage<_Scope>);
            return reinterpret_cast<atomic<uint32_t, _Scope>*>(__shared_state + __refcount_offset);
        }

        template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count>
        friend _LIBCUDACXX_INLINE_VISIBILITY
        pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state);

        template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count>
        friend _LIBCUDACXX_INLINE_VISIBILITY
        pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, size_t __producer_count);
     
        template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count>
        friend _LIBCUDACXX_INLINE_VISIBILITY
        pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, pipeline_role __role);
    };

    // On sm_80 and newer, a block-scoped barrier in shared memory is backed by a
    // hardware mbarrier whose phase can be tested directly.
    template<>
    _LIBCUDACXX_INLINE_VISIBILITY
    bool pipeline<thread_scope_block>::__barrier_try_wait_parity(barrier<thread_scope_block> & __barrier, bool __phase_parity)
    {
#if __CUDA_ARCH__ >= 800
        if (__isShared(&__barrier)) {
            uint64_t * __mbarrier = device::barrier_native_handle(__barrier);
            uint16_t __wait_complete;

            // Literal '%' in inline PTX must be escaped as "%%".
            asm volatile ("{"
                          "     .reg .pred %%p;"
                          "     mbarrier.test_wait.parity.shared.b64 %%p, [%1], %2;"
                          "     selp.u16 %0, 1, 0, %%p;"
                          "}"
                : "=h"(__wait_complete)
                : "r"(static_cast<uint32_t>(__cvta_generic_to_shared(__mbarrier))), "r"(static_cast<uint32_t>(__phase_parity))
                : "memory");

            return bool(__wait_complete);
        } else
#endif
        {
            return __barrier_try_wait_parity_impl(__barrier, __phase_parity);
        }
    }

    // Creates a unified pipeline: every thread of __group is both a producer and a
    // consumer, so only the "produced" barrier of each stage is initialized.
    template<class _Group, thread_scope _Scope, uint8_t _Stages_count>
    _LIBCUDACXX_INLINE_VISIBILITY
    pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state)
    {
        const uint32_t __group_size = static_cast<uint32_t>(__group.size());
        const uint32_t __thread_rank = static_cast<uint32_t>(__group.thread_rank());

        if (__thread_rank == 0) {
            for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) {
                init(&__shared_state->__stages[__stage].__produced, __group_size);
            }
            __shared_state->__refcount.store(__group_size, std::memory_order_relaxed);
        }
        __group.sync();

        return pipeline<_Scope>(reinterpret_cast<char*>(__shared_state->__stages), _Stages_count, false);
    }

    // Creates a partitioned pipeline with a caller-specified number of producers;
    // the remaining threads of __group are consumers.
    template<class _Group, thread_scope _Scope, uint8_t _Stages_count>
    _LIBCUDACXX_INLINE_VISIBILITY
    pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state, size_t __producer_count)
    {
        const uint32_t __group_size = static_cast<uint32_t>(__group.size());
        const uint32_t __thread_rank = static_cast<uint32_t>(__group.thread_rank());

        if (__thread_rank == 0) {
            const size_t __consumer_count = __group_size - __producer_count;
            for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) {
                init(&__shared_state->__stages[__stage].__consumed, __consumer_count);
                init(&__shared_state->__stages[__stage].__produced, __producer_count);
            }
            __shared_state->__refcount.store(__group_size, std::memory_order_relaxed);
        }
        __group.sync();

        return pipeline<_Scope>(reinterpret_cast<char*>(__shared_state->__stages), _Stages_count, true);
    }
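
    // Illustrative sketch (values hypothetical): a partitioned pipeline in which
    // exactly 32 of the group's threads will take the producer role:
    //
    //   auto pipe = cuda::make_pipeline(block, &state, /* __producer_count */ size_t(32));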
 
    // Creates a partitioned pipeline in which each thread declares its own role. The
    // producer count is discovered at runtime: the refcount is temporarily repurposed
    // as a producer tally before being reset to the full group size.
    template<class _Group, thread_scope _Scope, uint8_t _Stages_count>
    _LIBCUDACXX_INLINE_VISIBILITY
    pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state, pipeline_role __role)
    {
        const uint32_t __group_size = static_cast<uint32_t>(__group.size());
        const uint32_t __thread_rank = static_cast<uint32_t>(__group.thread_rank());

        if (__thread_rank == 0) {
            __shared_state->__refcount.store(0, std::memory_order_relaxed);
        }
        __group.sync();

        if (__role == pipeline_role::producer) {
#ifdef __CUDA_ARCH__
            // Elect one lane per group of converged producers to add their count once.
            const uint32_t __match_mask = __match_any_sync(__activemask(), reinterpret_cast<uintptr_t>(&__shared_state->__refcount));
            const uint32_t __elected_id = __ffs(__match_mask) - 1;
            const bool __elected = (__pipeline_asm_helper::__lane_id() == __elected_id);
            const uint32_t __add_count = __popc(__match_mask);
#else
            const bool __elected = true;
            const uint32_t __add_count = 1;
#endif
            if (__elected) {
                (void)__shared_state->__refcount.fetch_add(__add_count, std::memory_order_relaxed);
            }
        }
        __group.sync();

        if (__thread_rank == 0) {
            // The refcount now holds the producer tally; size the stage barriers from
            // it, then restore it to the full reference count.
            const uint32_t __producer_count = __shared_state->__refcount.load(std::memory_order_relaxed);
            const uint32_t __consumer_count = __group_size - __producer_count;
            for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) {
                init(&__shared_state->__stages[__stage].__consumed, __consumer_count);
                init(&__shared_state->__stages[__stage].__produced, __producer_count);
            }
            __shared_state->__refcount.store(__group_size, std::memory_order_relaxed);
        }
        __group.sync();

        return pipeline<_Scope>(reinterpret_cast<char*>(__shared_state->__stages), _Stages_count, true);
    }
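
    // Illustrative sketch (names hypothetical): every thread of the group declares a
    // role, then issues only the calls matching it:
    //
    //   auto role = (block.thread_rank() < 32) ? cuda::pipeline_role::producer
    //                                          : cuda::pipeline_role::consumer;
    //   auto pipe = cuda::make_pipeline(block, &state, role);
    //   if (role == cuda::pipeline_role::producer) {
    //       pipe.producer_acquire();
    //       cuda::memcpy_async(smem, gmem, bytes, pipe);
    //       pipe.producer_commit();
    //   } else {
    //       pipe.consumer_wait();
    //       // ... consume smem ...
    //       pipe.consumer_release();
    //   }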

_LIBCUDACXX_END_NAMESPACE_CUDA

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE

    template<uint8_t _Prior>
    __device__
    void __pipeline_consumer_wait(pipeline<thread_scope_thread> & __pipeline);

    __device__
    inline void __pipeline_consumer_wait(pipeline<thread_scope_thread> & __pipeline, uint8_t __prior);

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

    template<>
    class pipeline<thread_scope_thread> {
    public:
        pipeline(pipeline &&) = default;
        pipeline(const pipeline &) = delete;
        pipeline & operator=(pipeline &&) = delete;
        pipeline & operator=(const pipeline &) = delete;

        _LIBCUDACXX_INLINE_VISIBILITY
        ~pipeline() {}

        _LIBCUDACXX_INLINE_VISIBILITY
        bool quit()
        {
            return true;
        }

        _LIBCUDACXX_INLINE_VISIBILITY
        void producer_acquire() {}

        // Commits the asynchronous operations issued since the previous commit as a
        // single cp.async commit-group (sm_80 and newer).
        _LIBCUDACXX_INLINE_VISIBILITY
        void producer_commit()
        {
#if __CUDA_ARCH__ >= 800
            asm volatile ("cp.async.commit_group;");
            ++__head;
#endif
        }

        // Waits for the oldest committed group by allowing at most
        // __head - __tail - 1 newer groups to remain outstanding.
        _LIBCUDACXX_INLINE_VISIBILITY
        void consumer_wait()
        {
#if __CUDA_ARCH__ >= 800
            if (__head == __tail) {
                return;
            }

            const uint8_t __prior = __head - __tail - 1;
            device::__pipeline_consumer_wait(*this, __prior);
            ++__tail;
#endif
        }

        _LIBCUDACXX_INLINE_VISIBILITY
        void consumer_release() {}
    
        template<class _Rep, class _Period>
        _LIBCUDACXX_INLINE_VISIBILITY
        bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period> & __duration)
        {
            (void)__duration;
            consumer_wait();
            return true;
        }

        template<class _Clock, class _Duration> 
        _LIBCUDACXX_INLINE_VISIBILITY
        bool consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration> & __time_point)
        {
            (void)__time_point;
            consumer_wait();
            return true;
        }

    private:
        uint8_t __head;
        uint8_t __tail;

        _LIBCUDACXX_INLINE_VISIBILITY
        pipeline()
            : __head(0)
            , __tail(0)
        {}

        friend _LIBCUDACXX_INLINE_VISIBILITY inline pipeline<thread_scope_thread> make_pipeline();

        template<uint8_t _Prior>
        friend _LIBCUDACXX_INLINE_VISIBILITY
        void pipeline_consumer_wait_prior(pipeline<thread_scope_thread> & __pipeline);
    };

_LIBCUDACXX_END_NAMESPACE_CUDA

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE

    // cp.async.wait_group takes an immediate operand, so the number of groups that
    // may remain outstanding is a template parameter, clamped to 8.
    template<uint8_t _Prior>
    __device__
    void __pipeline_consumer_wait(pipeline<thread_scope_thread> & __pipeline)
    {
        (void)__pipeline;
#if __CUDA_ARCH__ >= 800
        constexpr uint8_t __max_prior = 8;

        asm volatile ("cp.async.wait_group %0;"
            :
            : "n"(_Prior < __max_prior ? _Prior : __max_prior));
#endif
    }

    // Dispatches a runtime prior-count to the immediate-operand template above.
    __device__
    inline void __pipeline_consumer_wait(pipeline<thread_scope_thread> & __pipeline, uint8_t __prior)
    {
        switch (__prior) {
        case 0:  device::__pipeline_consumer_wait<0>(__pipeline); break;
        case 1:  device::__pipeline_consumer_wait<1>(__pipeline); break;
        case 2:  device::__pipeline_consumer_wait<2>(__pipeline); break;
        case 3:  device::__pipeline_consumer_wait<3>(__pipeline); break;
        case 4:  device::__pipeline_consumer_wait<4>(__pipeline); break;
        case 5:  device::__pipeline_consumer_wait<5>(__pipeline); break;
        case 6:  device::__pipeline_consumer_wait<6>(__pipeline); break;
        case 7:  device::__pipeline_consumer_wait<7>(__pipeline); break;
        default: device::__pipeline_consumer_wait<8>(__pipeline); break;
        }
    }

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

    _LIBCUDACXX_INLINE_VISIBILITY
    inline pipeline<thread_scope_thread> make_pipeline()
    {
        return pipeline<thread_scope_thread>();
    }
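
    // Illustrative sketch (names hypothetical) of a per-thread pipeline keeping two
    // copy batches in flight; on sm_80+ each commit maps to cp.async.commit_group:
    //
    //   auto pipe = cuda::make_pipeline();
    //   pipe.producer_acquire();
    //   cuda::memcpy_async(smem_a, gmem_a, cuda::aligned_size_t<16>(bytes), pipe);
    //   pipe.producer_commit();
    //   pipe.producer_acquire();
    //   cuda::memcpy_async(smem_b, gmem_b, cuda::aligned_size_t<16>(bytes), pipe);
    //   pipe.producer_commit();
    //   pipe.consumer_wait();      // first batch is now readable
    //   // ... use smem_a ...
    //   pipe.consumer_release();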

    // Waits until at most _Prior commit-groups of __pipeline remain outstanding and
    // advances the tail accordingly.
    template<uint8_t _Prior>
    _LIBCUDACXX_INLINE_VISIBILITY
    void pipeline_consumer_wait_prior(pipeline<thread_scope_thread> & __pipeline)
    {
#if __CUDA_ARCH__ >= 800
        device::__pipeline_consumer_wait<_Prior>(__pipeline);
        __pipeline.__tail = __pipeline.__head - _Prior;
#endif
    }

    // Binds the asynchronous operations previously submitted through __pipeline to
    // __barrier, so that the barrier's current phase completes only after they finish.
    template<thread_scope _Scope>
    _LIBCUDACXX_INLINE_VISIBILITY
    void pipeline_producer_commit(pipeline<thread_scope_thread> & __pipeline, barrier<_Scope> & __barrier)
    {
        (void)__pipeline;
#if __CUDA_ARCH__ >= 800
        __memcpy_async_synchronize(__barrier, true);
#endif
    }
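
    // Illustrative sketch (variables hypothetical): completing thread-local copies
    // through a block-scoped barrier rather than the pipeline itself:
    //
    //   cuda::memcpy_async(smem, gmem, size, thread_pipe);
    //   cuda::pipeline_producer_commit(thread_pipe, block_barrier);
    //   block_barrier.arrive_and_wait();   // copies are visible past this point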

    template<thread_scope _Scope>
    _LIBCUDACXX_INLINE_VISIBILITY
    void __memcpy_async_synchronize(pipeline<_Scope> & __pipeline, bool __is_async) {
        // memcpy_async submissions never synchronize on their own in the pipeline path.
        (void)__pipeline;
        (void)__is_async;
    }

    template<typename _Group, class _Type, thread_scope _Scope>
    _LIBCUDACXX_INLINE_VISIBILITY
    void memcpy_async(_Group const & __group, _Type * __destination, _Type const * __source, std::size_t __size, pipeline<_Scope> & __pipeline)
    {
        // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are
        // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with
        // memcpy_async when compiling with GCC 4.8.
        // FIXME: remove the #if once GCC 4.8 is no longer supported.
    #if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
        static_assert(_CUDA_VSTD::is_trivially_copyable<_Type>::value, "memcpy_async requires a trivially copyable type");
    #endif

        __memcpy_async<alignof(_Type)>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __pipeline);
    }

    template<typename _Group, class _Type, std::size_t _Alignment, thread_scope _Scope, std::size_t _Larger_alignment = (alignof(_Type) > _Alignment) ? alignof(_Type) : _Alignment>
    _LIBCUDACXX_INLINE_VISIBILITY
    void memcpy_async(_Group const & __group, _Type * __destination, _Type const * __source, aligned_size_t<_Alignment> __size, pipeline<_Scope> & __pipeline) {
        // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are
        // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with
        // memcpy_async when compiling with GCC 4.8.
        // FIXME: remove the #if once GCC 4.8 is no longer supported.
    #if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
        static_assert(_CUDA_VSTD::is_trivially_copyable<_Type>::value, "memcpy_async requires a trivially copyable type");
    #endif

        __memcpy_async<_Larger_alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __pipeline);
    }

    template<class _Type, typename _Size, thread_scope _Scope>
    _LIBCUDACXX_INLINE_VISIBILITY
    void memcpy_async(_Type * __destination, _Type const * __source, _Size __size, pipeline<_Scope> & __pipeline) {
        memcpy_async(__single_thread_group{}, __destination, __source, __size, __pipeline);
    }

    template<typename _Group, thread_scope _Scope>
    _LIBCUDACXX_INLINE_VISIBILITY
    void memcpy_async(_Group const & __group, void * __destination, void const * __source, std::size_t __size, pipeline<_Scope> & __pipeline) {
        __memcpy_async<1>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __pipeline);
    }

    template<typename _Group, std::size_t _Alignment, thread_scope _Scope>
    _LIBCUDACXX_INLINE_VISIBILITY
    void memcpy_async(_Group const & __group, void * __destination, void const * __source, aligned_size_t<_Alignment> __size, pipeline<_Scope> & __pipeline) {
        __memcpy_async<_Alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __pipeline);
    }

    template<typename _Size, thread_scope _Scope>
    _LIBCUDACXX_INLINE_VISIBILITY
    void memcpy_async(void * __destination, void const * __source, _Size __size, pipeline<_Scope> & __pipeline) {
        memcpy_async(__single_thread_group{}, __destination, __source, __size, __pipeline);
    }
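
    // Illustrative sketch (values hypothetical): the aligned_size_t overloads carry a
    // compile-time alignment guarantee that permits wider hardware copies:
    //
    //   cuda::memcpy_async(block, dst, src, cuda::aligned_size_t<16>(count * sizeof(int)), pipe);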

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif //_CUDA_PIPELINE
