//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCUDACXX_ALGORITHM
#define _LIBCUDACXX_ALGORITHM

/*
    algorithm synopsis

#include <initializer_list>

namespace std
{

template <class InputIterator, class Predicate>
    constexpr bool     // constexpr in C++20
    all_of(InputIterator first, InputIterator last, Predicate pred);

template <class InputIterator, class Predicate>
    constexpr bool     // constexpr in C++20
    any_of(InputIterator first, InputIterator last, Predicate pred);

template <class InputIterator, class Predicate>
    constexpr bool     // constexpr in C++20
    none_of(InputIterator first, InputIterator last, Predicate pred);

template <class InputIterator, class Function>
    constexpr Function          // constexpr in C++20
    for_each(InputIterator first, InputIterator last, Function f);

template<class InputIterator, class Size, class Function>
    constexpr InputIterator     // constexpr in C++20
    for_each_n(InputIterator first, Size n, Function f); // C++17

template <class InputIterator, class T>
    constexpr InputIterator     // constexpr in C++20
    find(InputIterator first, InputIterator last, const T& value);

template <class InputIterator, class Predicate>
    constexpr InputIterator     // constexpr in C++20
    find_if(InputIterator first, InputIterator last, Predicate pred);

template<class InputIterator, class Predicate>
    InputIterator               // constexpr in C++20
    find_if_not(InputIterator first, InputIterator last, Predicate pred);

template <class ForwardIterator1, class ForwardIterator2>
    ForwardIterator1            // constexpr in C++20
    find_end(ForwardIterator1 first1, ForwardIterator1 last1,
             ForwardIterator2 first2, ForwardIterator2 last2);

template <class ForwardIterator1, class ForwardIterator2, class BinaryPredicate>
    ForwardIterator1            // constexpr in C++20
    find_end(ForwardIterator1 first1, ForwardIterator1 last1,
             ForwardIterator2 first2, ForwardIterator2 last2, BinaryPredicate pred);

template <class ForwardIterator1, class ForwardIterator2>
    constexpr ForwardIterator1  // constexpr in C++20
    find_first_of(ForwardIterator1 first1, ForwardIterator1 last1,
                  ForwardIterator2 first2, ForwardIterator2 last2);

template <class ForwardIterator1, class ForwardIterator2, class BinaryPredicate>
    constexpr ForwardIterator1  // constexpr in C++20
    find_first_of(ForwardIterator1 first1, ForwardIterator1 last1,
                  ForwardIterator2 first2, ForwardIterator2 last2, BinaryPredicate pred);

template <class ForwardIterator>
    constexpr ForwardIterator   // constexpr in C++20
    adjacent_find(ForwardIterator first, ForwardIterator last);

template <class ForwardIterator, class BinaryPredicate>
    constexpr ForwardIterator   // constexpr in C++20
    adjacent_find(ForwardIterator first, ForwardIterator last, BinaryPredicate pred);

template <class InputIterator, class T>
    constexpr typename iterator_traits<InputIterator>::difference_type  // constexpr in C++20
    count(InputIterator first, InputIterator last, const T& value);

template <class InputIterator, class Predicate>
    constexpr typename iterator_traits<InputIterator>::difference_type // constexpr in C++20
    count_if(InputIterator first, InputIterator last, Predicate pred);

template <class InputIterator1, class InputIterator2>
    constexpr pair<InputIterator1, InputIterator2>   // constexpr in C++20
    mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2);

template <class InputIterator1, class InputIterator2>
    constexpr pair<InputIterator1, InputIterator2>   // constexpr in C++20
    mismatch(InputIterator1 first1, InputIterator1 last1,
             InputIterator2 first2, InputIterator2 last2); // **C++14**

template <class InputIterator1, class InputIterator2, class BinaryPredicate>
    constexpr pair<InputIterator1, InputIterator2>   // constexpr in C++20
    mismatch(InputIterator1 first1, InputIterator1 last1,
             InputIterator2 first2, BinaryPredicate pred);

template <class InputIterator1, class InputIterator2, class BinaryPredicate>
    constexpr pair<InputIterator1, InputIterator2>   // constexpr in C++20
    mismatch(InputIterator1 first1, InputIterator1 last1,
             InputIterator2 first2, InputIterator2 last2,
             BinaryPredicate pred); // **C++14**

template <class InputIterator1, class InputIterator2>
    constexpr bool      // constexpr in C++20
    equal(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2);

template <class InputIterator1, class InputIterator2>
    constexpr bool      // constexpr in C++20
    equal(InputIterator1 first1, InputIterator1 last1,
          InputIterator2 first2, InputIterator2 last2); // **C++14**

template <class InputIterator1, class InputIterator2, class BinaryPredicate>
    constexpr bool      // constexpr in C++20
    equal(InputIterator1 first1, InputIterator1 last1,
          InputIterator2 first2, BinaryPredicate pred);

template <class InputIterator1, class InputIterator2, class BinaryPredicate>
    constexpr bool      // constexpr in C++20
    equal(InputIterator1 first1, InputIterator1 last1,
          InputIterator2 first2, InputIterator2 last2,
          BinaryPredicate pred); // **C++14**

template<class ForwardIterator1, class ForwardIterator2>
    constexpr bool      // constexpr in C++20
    is_permutation(ForwardIterator1 first1, ForwardIterator1 last1,
                   ForwardIterator2 first2);

template<class ForwardIterator1, class ForwardIterator2>
    constexpr bool      // constexpr in C++20
    is_permutation(ForwardIterator1 first1, ForwardIterator1 last1,
                   ForwardIterator2 first2, ForwardIterator2 last2); // **C++14**

template<class ForwardIterator1, class ForwardIterator2, class BinaryPredicate>
    constexpr bool      // constexpr in C++20
    is_permutation(ForwardIterator1 first1, ForwardIterator1 last1,
                   ForwardIterator2 first2, BinaryPredicate pred);

template<class ForwardIterator1, class ForwardIterator2, class BinaryPredicate>
    constexpr bool      // constexpr in C++20
    is_permutation(ForwardIterator1 first1, ForwardIterator1 last1,
                   ForwardIterator2 first2, ForwardIterator2 last2,
                   BinaryPredicate pred);  // **C++14**

template <class ForwardIterator1, class ForwardIterator2>
    constexpr ForwardIterator1      // constexpr in C++20
    search(ForwardIterator1 first1, ForwardIterator1 last1,
           ForwardIterator2 first2, ForwardIterator2 last2);

template <class ForwardIterator1, class ForwardIterator2, class BinaryPredicate>
    constexpr ForwardIterator1      // constexpr in C++20
    search(ForwardIterator1 first1, ForwardIterator1 last1,
           ForwardIterator2 first2, ForwardIterator2 last2, BinaryPredicate pred);

template <class ForwardIterator, class Size, class T>
    constexpr ForwardIterator       // constexpr in C++20
    search_n(ForwardIterator first, ForwardIterator last, Size count, const T& value);

template <class ForwardIterator, class Size, class T, class BinaryPredicate>
    constexpr ForwardIterator       // constexpr in C++20
    search_n(ForwardIterator first, ForwardIterator last,
             Size count, const T& value, BinaryPredicate pred);

template <class InputIterator, class OutputIterator>
    OutputIterator
    copy(InputIterator first, InputIterator last, OutputIterator result);

template<class InputIterator, class OutputIterator, class Predicate>
    OutputIterator
    copy_if(InputIterator first, InputIterator last,
            OutputIterator result, Predicate pred);

template<class InputIterator, class Size, class OutputIterator>
    OutputIterator
    copy_n(InputIterator first, Size n, OutputIterator result);

template <class BidirectionalIterator1, class BidirectionalIterator2>
    BidirectionalIterator2
    copy_backward(BidirectionalIterator1 first, BidirectionalIterator1 last,
                  BidirectionalIterator2 result);

template <class ForwardIterator1, class ForwardIterator2>
    ForwardIterator2
    swap_ranges(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2);

template <class ForwardIterator1, class ForwardIterator2>
    void
    iter_swap(ForwardIterator1 a, ForwardIterator2 b);

template <class InputIterator, class OutputIterator, class UnaryOperation>
    constexpr OutputIterator      // constexpr in C++20
    transform(InputIterator first, InputIterator last, OutputIterator result, UnaryOperation op);

template <class InputIterator1, class InputIterator2, class OutputIterator, class BinaryOperation>
    constexpr OutputIterator      // constexpr in C++20
    transform(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2,
              OutputIterator result, BinaryOperation binary_op);

template <class ForwardIterator, class T>
    constexpr void      // constexpr in C++20
    replace(ForwardIterator first, ForwardIterator last, const T& old_value, const T& new_value);

template <class ForwardIterator, class Predicate, class T>
    constexpr void      // constexpr in C++20
    replace_if(ForwardIterator first, ForwardIterator last, Predicate pred, const T& new_value);

template <class InputIterator, class OutputIterator, class T>
    constexpr OutputIterator      // constexpr in C++20
    replace_copy(InputIterator first, InputIterator last, OutputIterator result,
                 const T& old_value, const T& new_value);

template <class InputIterator, class OutputIterator, class Predicate, class T>
    constexpr OutputIterator      // constexpr in C++20
    replace_copy_if(InputIterator first, InputIterator last, OutputIterator result, Predicate pred, const T& new_value);

template <class ForwardIterator, class T>
    constexpr void      // constexpr in C++20
    fill(ForwardIterator first, ForwardIterator last, const T& value);

template <class OutputIterator, class Size, class T>
    constexpr OutputIterator      // constexpr in C++20
    fill_n(OutputIterator first, Size n, const T& value);

template <class ForwardIterator, class Generator>
    constexpr void      // constexpr in C++20
    generate(ForwardIterator first, ForwardIterator last, Generator gen);

template <class OutputIterator, class Size, class Generator>
    constexpr OutputIterator      // constexpr in C++20
    generate_n(OutputIterator first, Size n, Generator gen);

template <class ForwardIterator, class T>
    constexpr ForwardIterator     // constexpr in C++20
    remove(ForwardIterator first, ForwardIterator last, const T& value);

template <class ForwardIterator, class Predicate>
    constexpr ForwardIterator     // constexpr in C++20
    remove_if(ForwardIterator first, ForwardIterator last, Predicate pred);

template <class InputIterator, class OutputIterator, class T>
    constexpr OutputIterator     // constexpr in C++20
    remove_copy(InputIterator first, InputIterator last, OutputIterator result, const T& value);

template <class InputIterator, class OutputIterator, class Predicate>
    constexpr OutputIterator     // constexpr in C++20
    remove_copy_if(InputIterator first, InputIterator last, OutputIterator result, Predicate pred);

template <class ForwardIterator>
    ForwardIterator
    unique(ForwardIterator first, ForwardIterator last);

template <class ForwardIterator, class BinaryPredicate>
    ForwardIterator
    unique(ForwardIterator first, ForwardIterator last, BinaryPredicate pred);

template <class InputIterator, class OutputIterator>
    OutputIterator
    unique_copy(InputIterator first, InputIterator last, OutputIterator result);

template <class InputIterator, class OutputIterator, class BinaryPredicate>
    OutputIterator
    unique_copy(InputIterator first, InputIterator last, OutputIterator result, BinaryPredicate pred);

template <class BidirectionalIterator>
    void
    reverse(BidirectionalIterator first, BidirectionalIterator last);

template <class BidirectionalIterator, class OutputIterator>
    constexpr OutputIterator       // constexpr in C++20
    reverse_copy(BidirectionalIterator first, BidirectionalIterator last, OutputIterator result);

template <class ForwardIterator>
    ForwardIterator
    rotate(ForwardIterator first, ForwardIterator middle, ForwardIterator last);

template <class ForwardIterator, class OutputIterator>
    OutputIterator
    rotate_copy(ForwardIterator first, ForwardIterator middle, ForwardIterator last, OutputIterator result);

template <class RandomAccessIterator>
    void
    random_shuffle(RandomAccessIterator first, RandomAccessIterator last); // deprecated in C++14, removed in C++17

template <class RandomAccessIterator, class RandomNumberGenerator>
    void
    random_shuffle(RandomAccessIterator first, RandomAccessIterator last,
                   RandomNumberGenerator& rand);  // deprecated in C++14, removed in C++17

template<class PopulationIterator, class SampleIterator,
         class Distance, class UniformRandomBitGenerator>
    SampleIterator sample(PopulationIterator first, PopulationIterator last,
                          SampleIterator out, Distance n,
                          UniformRandomBitGenerator&& g); // C++17

template<class RandomAccessIterator, class UniformRandomNumberGenerator>
    void shuffle(RandomAccessIterator first, RandomAccessIterator last,
                 UniformRandomNumberGenerator&& g);

template <class InputIterator, class Predicate>
    constexpr bool  // constexpr in C++20
    is_partitioned(InputIterator first, InputIterator last, Predicate pred);

template <class ForwardIterator, class Predicate>
    ForwardIterator
    partition(ForwardIterator first, ForwardIterator last, Predicate pred);

template <class InputIterator, class OutputIterator1,
          class OutputIterator2, class Predicate>
    constexpr pair<OutputIterator1, OutputIterator2>   // constexpr in C++20
    partition_copy(InputIterator first, InputIterator last,
                   OutputIterator1 out_true, OutputIterator2 out_false,
                   Predicate pred);

template <class ForwardIterator, class Predicate>
    ForwardIterator
    stable_partition(ForwardIterator first, ForwardIterator last, Predicate pred);

template<class ForwardIterator, class Predicate>
    constexpr ForwardIterator  // constexpr in C++20
    partition_point(ForwardIterator first, ForwardIterator last, Predicate pred);

template <class ForwardIterator>
    constexpr bool  // constexpr in C++20
    is_sorted(ForwardIterator first, ForwardIterator last);

template <class ForwardIterator, class Compare>
    bool
    is_sorted(ForwardIterator first, ForwardIterator last, Compare comp);

template<class ForwardIterator>
    constexpr ForwardIterator    // constexpr in C++20
    is_sorted_until(ForwardIterator first, ForwardIterator last);

template <class ForwardIterator, class Compare>
    constexpr ForwardIterator    // constexpr in C++20
    is_sorted_until(ForwardIterator first, ForwardIterator last, Compare comp);

template <class RandomAccessIterator>
    void
    sort(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    sort(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    void
    stable_sort(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    stable_sort(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    void
    partial_sort(RandomAccessIterator first, RandomAccessIterator middle, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    partial_sort(RandomAccessIterator first, RandomAccessIterator middle, RandomAccessIterator last, Compare comp);

template <class InputIterator, class RandomAccessIterator>
    RandomAccessIterator
    partial_sort_copy(InputIterator first, InputIterator last,
                      RandomAccessIterator result_first, RandomAccessIterator result_last);

template <class InputIterator, class RandomAccessIterator, class Compare>
    RandomAccessIterator
    partial_sort_copy(InputIterator first, InputIterator last,
                      RandomAccessIterator result_first, RandomAccessIterator result_last, Compare comp);

template <class RandomAccessIterator>
    void
    nth_element(RandomAccessIterator first, RandomAccessIterator nth, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    nth_element(RandomAccessIterator first, RandomAccessIterator nth, RandomAccessIterator last, Compare comp);

template <class ForwardIterator, class T>
    constexpr ForwardIterator                         // constexpr in C++20
    lower_bound(ForwardIterator first, ForwardIterator last, const T& value);

template <class ForwardIterator, class T, class Compare>
    constexpr ForwardIterator                         // constexpr in C++20
    lower_bound(ForwardIterator first, ForwardIterator last, const T& value, Compare comp);

template <class ForwardIterator, class T>
    constexpr ForwardIterator                         // constexpr in C++20
    upper_bound(ForwardIterator first, ForwardIterator last, const T& value);

template <class ForwardIterator, class T, class Compare>
    constexpr ForwardIterator                         // constexpr in C++20
    upper_bound(ForwardIterator first, ForwardIterator last, const T& value, Compare comp);

template <class ForwardIterator, class T>
    constexpr pair<ForwardIterator, ForwardIterator>  // constexpr in C++20
    equal_range(ForwardIterator first, ForwardIterator last, const T& value);

template <class ForwardIterator, class T, class Compare>
    constexpr pair<ForwardIterator, ForwardIterator>  // constexpr in C++20
    equal_range(ForwardIterator first, ForwardIterator last, const T& value, Compare comp);

template <class ForwardIterator, class T>
    constexpr bool                                    // constexpr in C++20
    binary_search(ForwardIterator first, ForwardIterator last, const T& value);

template <class ForwardIterator, class T, class Compare>
    constexpr bool                                    // constexpr in C++20
    binary_search(ForwardIterator first, ForwardIterator last, const T& value, Compare comp);

template <class InputIterator1, class InputIterator2, class OutputIterator>
    OutputIterator
    merge(InputIterator1 first1, InputIterator1 last1,
          InputIterator2 first2, InputIterator2 last2, OutputIterator result);

template <class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
    OutputIterator
    merge(InputIterator1 first1, InputIterator1 last1,
          InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp);

template <class BidirectionalIterator>
    void
    inplace_merge(BidirectionalIterator first, BidirectionalIterator middle, BidirectionalIterator last);

template <class BidirectionalIterator, class Compare>
    void
    inplace_merge(BidirectionalIterator first, BidirectionalIterator middle, BidirectionalIterator last, Compare comp);

template <class InputIterator1, class InputIterator2>
    constexpr bool                                    // constexpr in C++20
    includes(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2);

template <class InputIterator1, class InputIterator2, class Compare>
    constexpr bool                                    // constexpr in C++20
    includes(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, Compare comp);

template <class InputIterator1, class InputIterator2, class OutputIterator>
    OutputIterator
    set_union(InputIterator1 first1, InputIterator1 last1,
              InputIterator2 first2, InputIterator2 last2, OutputIterator result);

template <class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
    OutputIterator
    set_union(InputIterator1 first1, InputIterator1 last1,
              InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp);

template <class InputIterator1, class InputIterator2, class OutputIterator>
    constexpr OutputIterator                         // constexpr in C++20
    set_intersection(InputIterator1 first1, InputIterator1 last1,
                     InputIterator2 first2, InputIterator2 last2, OutputIterator result);

template <class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
    constexpr OutputIterator                         // constexpr in C++20
    set_intersection(InputIterator1 first1, InputIterator1 last1,
                     InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp);

template <class InputIterator1, class InputIterator2, class OutputIterator>
    OutputIterator
    set_difference(InputIterator1 first1, InputIterator1 last1,
                   InputIterator2 first2, InputIterator2 last2, OutputIterator result);

template <class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
    OutputIterator
    set_difference(InputIterator1 first1, InputIterator1 last1,
                   InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp);

template <class InputIterator1, class InputIterator2, class OutputIterator>
    OutputIterator
    set_symmetric_difference(InputIterator1 first1, InputIterator1 last1,
                             InputIterator2 first2, InputIterator2 last2, OutputIterator result);

template <class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
    OutputIterator
    set_symmetric_difference(InputIterator1 first1, InputIterator1 last1,
                             InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp);

template <class RandomAccessIterator>
    void
    push_heap(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    push_heap(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    void
    pop_heap(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    pop_heap(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    void
    make_heap(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    make_heap(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    void
    sort_heap(RandomAccessIterator first, RandomAccessIterator last);

template <class RandomAccessIterator, class Compare>
    void
    sort_heap(RandomAccessIterator first, RandomAccessIterator last, Compare comp);

template <class RandomAccessIterator>
    constexpr bool   // constexpr in C++20
    is_heap(RandomAccessIterator first, RandomAccessiterator last);

template <class RandomAccessIterator, class Compare>
    constexpr bool   // constexpr in C++20
    is_heap(RandomAccessIterator first, RandomAccessiterator last, Compare comp);

template <class RandomAccessIterator>
    constexpr RandomAccessIterator   // constexpr in C++20
    is_heap_until(RandomAccessIterator first, RandomAccessiterator last);

template <class RandomAccessIterator, class Compare>
    constexpr RandomAccessIterator   // constexpr in C++20
    is_heap_until(RandomAccessIterator first, RandomAccessiterator last, Compare comp);

template <class ForwardIterator>
    ForwardIterator
    min_element(ForwardIterator first, ForwardIterator last);  // constexpr in C++14

template <class ForwardIterator, class Compare>
    ForwardIterator
    min_element(ForwardIterator first, ForwardIterator last, Compare comp);  // constexpr in C++14

template <class T>
    const T&
    min(const T& a, const T& b);  // constexpr in C++14

template <class T, class Compare>
    const T&
    min(const T& a, const T& b, Compare comp);  // constexpr in C++14

template<class T>
    T
    min(::std::initializer_list<T> t);  // constexpr in C++14

template<class T, class Compare>
    T
    min(::std::initializer_list<T> t, Compare comp);  // constexpr in C++14

template<class T>
    constexpr const T& clamp( const T& v, const T& lo, const T& hi );               // C++17

template<class T, class Compare>
    constexpr const T& clamp( const T& v, const T& lo, const T& hi, Compare comp ); // C++17

template <class ForwardIterator>
    ForwardIterator
    max_element(ForwardIterator first, ForwardIterator last);  // constexpr in C++14

template <class ForwardIterator, class Compare>
    ForwardIterator
    max_element(ForwardIterator first, ForwardIterator last, Compare comp);  // constexpr in C++14

template <class T>
    const T&
    max(const T& a, const T& b); // constexpr in C++14

template <class T, class Compare>
    const T&
    max(const T& a, const T& b, Compare comp);  // constexpr in C++14

template<class T>
    T
    max(::std::initializer_list<T> t);  // constexpr in C++14

template<class T, class Compare>
    T
    max(::std::initializer_list<T> t, Compare comp);  // constexpr in C++14

template<class ForwardIterator>
    pair<ForwardIterator, ForwardIterator>
    minmax_element(ForwardIterator first, ForwardIterator last);   // constexpr in C++14

template<class ForwardIterator, class Compare>
    pair<ForwardIterator, ForwardIterator>
    minmax_element(ForwardIterator first, ForwardIterator last, Compare comp);   // constexpr in C++14

template<class T>
    pair<const T&, const T&>
    minmax(const T& a, const T& b);  // constexpr in C++14

template<class T, class Compare>
    pair<const T&, const T&>
    minmax(const T& a, const T& b, Compare comp);  // constexpr in C++14

template<class T>
    pair<T, T>
    minmax(::std::initializer_list<T> t);  // constexpr in C++14

template<class T, class Compare>
    pair<T, T>
    minmax(::std::initializer_list<T> t, Compare comp);  // constexpr in C++14

template <class InputIterator1, class InputIterator2>
    constexpr bool     // constexpr in C++20
    lexicographical_compare(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2);

template <class InputIterator1, class InputIterator2, class Compare>
    constexpr bool     // constexpr in C++20
    lexicographical_compare(InputIterator1 first1, InputIterator1 last1,
                            InputIterator2 first2, InputIterator2 last2, Compare comp);

template <class BidirectionalIterator>
    bool
    next_permutation(BidirectionalIterator first, BidirectionalIterator last);

template <class BidirectionalIterator, class Compare>
    bool
    next_permutation(BidirectionalIterator first, BidirectionalIterator last, Compare comp);

template <class BidirectionalIterator>
    bool
    prev_permutation(BidirectionalIterator first, BidirectionalIterator last);

template <class BidirectionalIterator, class Compare>
    bool
    prev_permutation(BidirectionalIterator first, BidirectionalIterator last, Compare comp);

}  // std

*/
#ifndef __cuda_std__
#include <__config>
#endif // __cuda_std__

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include "__algorithm/adjacent_find.h"
#include "__algorithm/all_of.h"
#include "__algorithm/any_of.h"
#include "__algorithm/binary_search.h"
#include "__algorithm/clamp.h"
#include "__algorithm/comp_ref_type.h"
#include "__algorithm/comp.h"
#include "__algorithm/copy_backward.h"
#include "__algorithm/copy_if.h"
#include "__algorithm/copy_n.h"
#include "__algorithm/copy.h"
#include "__algorithm/count_if.h"
#include "__algorithm/count.h"
#include "__algorithm/equal_range.h"
#include "__algorithm/equal.h"
#include "__algorithm/fill_n.h"
#include "__algorithm/fill.h"
#include "__algorithm/find_end.h"
#include "__algorithm/find_first_of.h"
#include "__algorithm/find_if_not.h"
#include "__algorithm/find_if.h"
#include "__algorithm/find.h"
#include "__algorithm/for_each_n.h"
#include "__algorithm/for_each.h"
#include "__algorithm/generate_n.h"
#include "__algorithm/generate.h"
#include "__algorithm/half_positive.h"
#include "__algorithm/includes.h"
#include "__algorithm/is_heap_until.h"
#include "__algorithm/is_heap.h"
#include "__algorithm/is_partitioned.h"
#include "__algorithm/is_permutation.h"
#include "__algorithm/is_sorted_until.h"
#include "__algorithm/is_sorted.h"
#include "__algorithm/iter_swap.h"
#include "__algorithm/iterator_operations.h"
#include "__algorithm/lexicographical_compare.h"
#include "__algorithm/lower_bound.h"
#include "__algorithm/make_heap.h"
#include "__algorithm/make_projected.h"
#include "__algorithm/max_element.h"
#include "__algorithm/max.h"
#include "__algorithm/merge.h"
#include "__algorithm/min_element.h"
#include "__algorithm/min.h"
#include "__algorithm/minmax_element.h"
#include "__algorithm/minmax.h"
#include "__algorithm/mismatch.h"
#include "__algorithm/move_backward.h"
#include "__algorithm/move.h"
#include "__algorithm/next_permutation.h"
#include "__algorithm/none_of.h"
#include "__algorithm/partial_sort_copy.h"
#include "__algorithm/partial_sort.h"
#include "__algorithm/partition_copy.h"
#include "__algorithm/partition_point.h"
#include "__algorithm/partition.h"
#include "__algorithm/pop_heap.h"
#include "__algorithm/prev_permutation.h"
#include "__algorithm/push_heap.h"
#include "__algorithm/ranges_iterator_concept.h"
#include "__algorithm/remove_copy_if.h"
#include "__algorithm/remove_copy.h"
#include "__algorithm/remove_if.h"
#include "__algorithm/remove.h"
#include "__algorithm/replace_copy_if.h"
#include "__algorithm/replace_copy.h"
#include "__algorithm/replace_if.h"
#include "__algorithm/replace.h"
#include "__algorithm/reverse_copy.h"
#include "__algorithm/reverse.h"
#include "__algorithm/rotate_copy.h"
#include "__algorithm/rotate.h"
#include "__algorithm/search_n.h"
#include "__algorithm/search.h"
#include "__algorithm/set_difference.h"
#include "__algorithm/set_intersection.h"
#include "__algorithm/set_symmetric_difference.h"
#include "__algorithm/set_union.h"
#include "__algorithm/shift_left.h"
#include "__algorithm/shift_right.h"
#include "__algorithm/sift_down.h"
#include "__algorithm/sort_heap.h"
#include "__algorithm/swap_ranges.h"
#include "__algorithm/transform.h"
#include "__algorithm/unique_copy.h"
#include "__algorithm/unique.h"
#include "__algorithm/upper_bound.h"
#include "__assert" // all public C++ headers provide the assertion handler
#include "__debug"
#include "__iterator/distance.h"
#include "__iterator/iterator_traits.h"
#include "__iterator/move_iterator.h"
#include "__iterator/next.h"
#include "__iterator/prev.h"
#include "__iterator/reverse_iterator.h"
#include "__iterator/wrap_iter.h"
#include "__type_traits/common_type.h"
#include "__type_traits/enable_if.h"
#include "__type_traits/is_integral.h"
#include "__type_traits/is_same.h"
#include "__type_traits/is_trivially_copy_assignable.h"
#include "__type_traits/make_unsigned.h"
#include "__type_traits/remove_const.h"
#include "bit"
#include "cstddef"
#include "functional"
#include "initializer_list"
#include "type_traits"
#include "version"

#ifndef __cuda_std__
#include <cstring>
#include <memory>
#endif // __cuda_std__

#ifndef __cuda_std__
#include <__pragma_push>
#endif // __cuda_std__

_LIBCUDACXX_BEGIN_NAMESPACE_STD

#ifndef __cuda_std__

template <class _Predicate>
class __invert // invert the sense of a comparison
{
private:
    _Predicate __p_;
public:
    _LIBCUDACXX_INLINE_VISIBILITY __invert() {}

    _LIBCUDACXX_INLINE_VISIBILITY
    explicit __invert(_Predicate __p) : __p_(__p) {}

    template <class _T1>
    _LIBCUDACXX_INLINE_VISIBILITY
    bool operator()(const _T1& __x) {return !__p_(__x);}

    template <class _T1, class _T2>
    _LIBCUDACXX_INLINE_VISIBILITY
    bool operator()(const _T1& __x, const _T2& __y) {return __p_(__y, __x);}
};

// random_shuffle

// __independent_bits_engine

template <unsigned long long _Xp, size_t _Rp>
struct __log2_imp
{
    static const size_t value = _Xp & ((unsigned long long)(1) << _Rp) ? _Rp
                                           : __log2_imp<_Xp, _Rp - 1>::value;
};

template <unsigned long long _Xp>
struct __log2_imp<_Xp, 0>
{
    static const size_t value = 0;
};

template <size_t _Rp>
struct __log2_imp<0, _Rp>
{
    static const size_t value = _Rp + 1;
};

template <class _UIntType, _UIntType _Xp>
struct __log2
{
    static const size_t value = __log2_imp<_Xp,
                                         sizeof(_UIntType) * __CHAR_BIT__ - 1>::value;
};

template<class _Engine, class _UIntType>
class __independent_bits_engine
{
public:
    // types
    typedef _UIntType result_type;

private:
    typedef typename _Engine::result_type _Engine_result_type;
    typedef __conditional_t
        <
            sizeof(_Engine_result_type) <= sizeof(result_type),
                result_type,
                _Engine_result_type
        > _Working_result_type;

    _Engine& __e_;
    size_t __w_;
    size_t __w0_;
    size_t __n_;
    size_t __n0_;
    _Working_result_type __y0_;
    _Working_result_type __y1_;
    _Engine_result_type __mask0_;
    _Engine_result_type __mask1_;

    static constexpr _Working_result_type _Rp =
        _Engine::max() - _Engine::min() + _Working_result_type(1);
    static constexpr size_t __m = __log2<_Working_result_type, _Rp>::value;
    static constexpr size_t _WDt = numeric_limits<_Working_result_type>::digits;
    static constexpr size_t _EDt = numeric_limits<_Engine_result_type>::digits;

public:
    // constructors and seeding functions
    __independent_bits_engine(_Engine& __e, size_t __w);

    // generating functions
    result_type operator()() {return __eval(integral_constant<bool, _Rp != 0>());}

private:
    result_type __eval(false_type);
    result_type __eval(true_type);
};

template<class _Engine, class _UIntType>
__independent_bits_engine<_Engine, _UIntType>
    ::__independent_bits_engine(_Engine& __e, size_t __w)
        : __e_(__e),
          __w_(__w)
{
    __n_ = __w_ / __m + (__w_ % __m != 0);
    __w0_ = __w_ / __n_;
    if (_Rp == 0)
        __y0_ = _Rp;
    else if (__w0_ < _WDt)
        __y0_ = (_Rp >> __w0_) << __w0_;
    else
        __y0_ = 0;
    if (_Rp - __y0_ > __y0_ / __n_)
    {
        ++__n_;
        __w0_ = __w_ / __n_;
        if (__w0_ < _WDt)
            __y0_ = (_Rp >> __w0_) << __w0_;
        else
            __y0_ = 0;
    }
    __n0_ = __n_ - __w_ % __n_;
    if (__w0_ < _WDt - 1)
        __y1_ = (_Rp >> (__w0_ + 1)) << (__w0_ + 1);
    else
        __y1_ = 0;
    __mask0_ = __w0_ > 0 ? _Engine_result_type(~0) >> (_EDt - __w0_) :
                          _Engine_result_type(0);
    __mask1_ = __w0_ < _EDt - 1 ?
                               _Engine_result_type(~0) >> (_EDt - (__w0_ + 1)) :
                               _Engine_result_type(~0);
}

template<class _Engine, class _UIntType>
inline
_UIntType
__independent_bits_engine<_Engine, _UIntType>::__eval(false_type)
{
    return static_cast<result_type>(__e_() & __mask0_);
}

template<class _Engine, class _UIntType>
_UIntType
__independent_bits_engine<_Engine, _UIntType>::__eval(true_type)
{
    const size_t _WRt = numeric_limits<result_type>::digits;
    result_type _Sp = 0;
    for (size_t __k = 0; __k < __n0_; ++__k)
    {
        _Engine_result_type __u;
        do
        {
            __u = __e_() - _Engine::min();
        } while (__u >= __y0_);
        if (__w0_ < _WRt)
            _Sp <<= __w0_;
        else
            _Sp = 0;
        _Sp += __u & __mask0_;
    }
    for (size_t __k = __n0_; __k < __n_; ++__k)
    {
        _Engine_result_type __u;
        do
        {
            __u = __e_() - _Engine::min();
        } while (__u >= __y1_);
        if (__w0_ < _WRt - 1)
            _Sp <<= __w0_ + 1;
        else
            _Sp = 0;
        _Sp += __u & __mask1_;
    }
    return _Sp;
}

// uniform_int_distribution

template<class _IntType = int>
class uniform_int_distribution
{
public:
    // types
    typedef _IntType result_type;

    class param_type
    {
        result_type __a_;
        result_type __b_;
    public:
        typedef uniform_int_distribution distribution_type;

        explicit param_type(result_type __a = 0,
                            result_type __b = numeric_limits<result_type>::max())
            : __a_(__a), __b_(__b) {}

        result_type a() const {return __a_;}
        result_type b() const {return __b_;}

        friend bool operator==(const param_type& __x, const param_type& __y)
            {return __x.__a_ == __y.__a_ && __x.__b_ == __y.__b_;}
        friend bool operator!=(const param_type& __x, const param_type& __y)
            {return !(__x == __y);}
    };

private:
    param_type __p_;

public:
    // constructors and reset functions
    explicit uniform_int_distribution(result_type __a = 0,
                                      result_type __b = numeric_limits<result_type>::max())
        : __p_(param_type(__a, __b)) {}
    explicit uniform_int_distribution(const param_type& __p) : __p_(__p) {}
    void reset() {}

    // generating functions
    template<class _URNG> result_type operator()(_URNG& __g)
        {return (*this)(__g, __p_);}
    template<class _URNG> result_type operator()(_URNG& __g, const param_type& __p);

    // property functions
    result_type a() const {return __p_.a();}
    result_type b() const {return __p_.b();}

    param_type param() const {return __p_;}
    void param(const param_type& __p) {__p_ = __p;}

    result_type min() const {return a();}
    result_type max() const {return b();}

    friend bool operator==(const uniform_int_distribution& __x,
                           const uniform_int_distribution& __y)
        {return __x.__p_ == __y.__p_;}
    friend bool operator!=(const uniform_int_distribution& __x,
                           const uniform_int_distribution& __y)
            {return !(__x == __y);}
};

template<class _IntType>
template<class _URNG>
typename uniform_int_distribution<_IntType>::result_type
uniform_int_distribution<_IntType>::operator()(_URNG& __g, const param_type& __p)
_LIBCUDACXX_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK
{
    typedef __conditional_t<sizeof(result_type) <= sizeof(uint32_t),
                                            uint32_t, uint64_t> _UIntType;
    const _UIntType _Rp = _UIntType(__p.b()) - _UIntType(__p.a()) + _UIntType(1);
    if (_Rp == 1)
        return __p.a();
    const size_t _Dt = numeric_limits<_UIntType>::digits;
    typedef __independent_bits_engine<_URNG, _UIntType> _Eng;
    if (_Rp == 0)
        return static_cast<result_type>(_Eng(__g, _Dt)());
    size_t __w = _Dt - __libcpp_clz(_Rp) - 1;
    if ((_Rp & (std::numeric_limits<_UIntType>::max() >> (_Dt - __w))) != 0)
        ++__w;
    _Eng __e(__g, __w);
    _UIntType __u;
    do
    {
        __u = __e();
    } while (__u >= _Rp);
    return static_cast<result_type>(__u + __p.a());
}

template <class _PopulationIterator, class _SampleIterator, class _Distance,
          class _UniformRandomNumberGenerator>
_LIBCUDACXX_INLINE_VISIBILITY
_SampleIterator __sample(_PopulationIterator __first,
                         _PopulationIterator __last, _SampleIterator __output_iter,
                         _Distance __n,
                         _UniformRandomNumberGenerator & __g,
                         input_iterator_tag) {

  _Distance __k = 0;
  for (; __first != __last && __k < __n; ++__first, (void) ++__k)
    __output_iter[__k] = *__first;
  _Distance __sz = __k;
  for (; __first != __last; ++__first, (void) ++__k) {
    _Distance __r = _CUDA_VSTD::uniform_int_distribution<_Distance>(0, __k)(__g);
    if (__r < __sz)
      __output_iter[__r] = *__first;
  }
  return __output_iter + _CUDA_VSTD::min(__n, __k);
}

template <class _PopulationIterator, class _SampleIterator, class _Distance,
          class _UniformRandomNumberGenerator>
_LIBCUDACXX_INLINE_VISIBILITY
_SampleIterator __sample(_PopulationIterator __first,
                         _PopulationIterator __last, _SampleIterator __output_iter,
                         _Distance __n,
                         _UniformRandomNumberGenerator& __g,
                         forward_iterator_tag) {
  _Distance __unsampled_sz = _CUDA_VSTD::distance(__first, __last);
  for (__n = _CUDA_VSTD::min(__n, __unsampled_sz); __n != 0; ++__first) {
    _Distance __r =
        _CUDA_VSTD::uniform_int_distribution<_Distance>(0, --__unsampled_sz)(__g);
    if (__r < __n) {
      *__output_iter++ = *__first;
      --__n;
    }
  }
  return __output_iter;
}

template <class _PopulationIterator, class _SampleIterator, class _Distance,
          class _UniformRandomNumberGenerator>
_LIBCUDACXX_INLINE_VISIBILITY
_SampleIterator __sample(_PopulationIterator __first,
                         _PopulationIterator __last, _SampleIterator __output_iter,
                         _Distance __n, _UniformRandomNumberGenerator& __g) {
  typedef typename iterator_traits<_PopulationIterator>::iterator_category
        _PopCategory;
  typedef typename iterator_traits<_PopulationIterator>::difference_type
        _Difference;
  static_assert(__is_cpp17_forward_iterator<_PopulationIterator>::value ||
                __is_cpp17_random_access_iterator<_SampleIterator>::value,
                "SampleIterator must meet the requirements of RandomAccessIterator");
  typedef typename common_type<_Distance, _Difference>::type _CommonType;
  _LIBCUDACXX_ASSERT(__n >= 0, "N must be a positive number.");
  return _CUDA_VSTD::__sample(
      __first, __last, __output_iter, _CommonType(__n),
      __g, _PopCategory());
}

#if _CCCL_STD_VER > 2014
template <class _PopulationIterator, class _SampleIterator, class _Distance,
          class _UniformRandomNumberGenerator>
inline _LIBCUDACXX_INLINE_VISIBILITY
_SampleIterator sample(_PopulationIterator __first,
                       _PopulationIterator __last, _SampleIterator __output_iter,
                       _Distance __n, _UniformRandomNumberGenerator&& __g) {
    return _CUDA_VSTD::__sample(__first, __last, __output_iter, __n, __g);
}
#endif // _CCCL_STD_VER > 2014

template<class _RandomAccessIterator, class _UniformRandomNumberGenerator>
_LIBCUDACXX_INLINE_VISIBILITY
    void shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last,
                 _UniformRandomNumberGenerator&& __g)
{
    typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
    typedef uniform_int_distribution<ptrdiff_t> _Dp;
    typedef typename _Dp::param_type _Pp;
    difference_type __d = __last - __first;
    if (__d > 1)
    {
        _Dp __uid;
        for (--__last, (void) --__d; __first < __last; ++__first, (void) --__d)
        {
            difference_type __i = __uid(__g, _Pp(0, __d));
            if (__i != difference_type(0))
                swap(*__first, *(__first + __i));
        }
    }
}

// stable_partition

template <class _Predicate, class _ForwardIterator, class _Distance, class _Pair>
_LIBCUDACXX_HOST_DEVICE
_ForwardIterator
__stable_partition(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred,
                   _Distance __len, _Pair __p, forward_iterator_tag __fit)
{
    // *__first is known to be false
    // __len >= 1
    if (__len == 1)
        return __first;
    if (__len == 2)
    {
        _ForwardIterator __m = __first;
        if (__pred(*++__m))
        {
            swap(*__first, *__m);
            return __m;
        }
        return __first;
    }
    if (__len <= __p.second)
    {   // The buffer is big enough to use
        typedef typename iterator_traits<_ForwardIterator>::value_type value_type;
        __destruct_n __d(0);
        unique_ptr<value_type, __destruct_n&> __h(__p.first, __d);
        // Move the falses into the temporary buffer, and the trues to the front of the line
        // Update __first to always point to the end of the trues
        value_type* __t = __p.first;
        ::new(__t) value_type(_CUDA_VSTD::move(*__first));
        __d.__incr((value_type*)0);
        ++__t;
        _ForwardIterator __i = __first;
        while (++__i != __last)
        {
            if (__pred(*__i))
            {
                *__first = _CUDA_VSTD::move(*__i);
                ++__first;
            }
            else
            {
                ::new(__t) value_type(_CUDA_VSTD::move(*__i));
                __d.__incr((value_type*)0);
                ++__t;
            }
        }
        // All trues now at start of range, all falses in buffer
        // Move falses back into range, but don't mess up __first which points to first false
        __i = __first;
        for (value_type* __t2 = __p.first; __t2 < __t; ++__t2, (void) ++__i)
            *__i = _CUDA_VSTD::move(*__t2);
        // __h destructs moved-from values out of the temp buffer, but doesn't deallocate buffer
        return __first;
    }
    // Else not enough buffer, do in place
    // __len >= 3
    _ForwardIterator __m = __first;
    _Distance __len2 = __len / 2;  // __len2 >= 2
    _CUDA_VSTD::advance(__m, __len2);
    // recurse on [__first, __m), *__first know to be false
    // F?????????????????
    // f       m         l
    typedef __add_lvalue_reference_t<_Predicate> _PredRef;
    _ForwardIterator __first_false = __stable_partition<_PredRef>(__first, __m, __pred, __len2, __p, __fit);
    // TTTFFFFF??????????
    // f  ff   m         l
    // recurse on [__m, __last], except increase __m until *(__m) is false, *__last know to be true
    _ForwardIterator __m1 = __m;
    _ForwardIterator __second_false = __last;
    _Distance __len_half = __len - __len2;
    while (__pred(*__m1))
    {
        if (++__m1 == __last)
            goto __second_half_done;
        --__len_half;
    }
    // TTTFFFFFTTTF??????
    // f  ff   m  m1     l
    __second_false = __stable_partition<_PredRef>(__m1, __last, __pred, __len_half, __p, __fit);
__second_half_done:
    // TTTFFFFFTTTTTFFFFF
    // f  ff   m    sf   l
    return _CUDA_VSTD::rotate(__first_false, __m, __second_false);
    // TTTTTTTTFFFFFFFFFF
    //         |
}

struct __return_temporary_buffer
{
    template <class _Tp>
    _LIBCUDACXX_INLINE_VISIBILITY void operator()(_Tp* __p) const {_CUDA_VSTD::return_temporary_buffer(__p);}
};

template <class _Predicate, class _ForwardIterator>
_LIBCUDACXX_HOST_DEVICE
_ForwardIterator
__stable_partition(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred,
                   forward_iterator_tag)
{
    const unsigned __alloc_limit = 3;  // might want to make this a function of trivial assignment
    // Either prove all true and return __first or point to first false
    while (true)
    {
        if (__first == __last)
            return __first;
        if (!__pred(*__first))
            break;
        ++__first;
    }
    // We now have a reduced range [__first, __last)
    // *__first is known to be false
    typedef typename iterator_traits<_ForwardIterator>::difference_type difference_type;
    typedef typename iterator_traits<_ForwardIterator>::value_type value_type;
    difference_type __len = _CUDA_VSTD::distance(__first, __last);
    pair<value_type*, ptrdiff_t> __p(0, 0);
    unique_ptr<value_type, __return_temporary_buffer> __h;
    if (__len >= __alloc_limit)
    {
        __p = _CUDA_VSTD::get_temporary_buffer<value_type>(__len);
        __h.reset(__p.first);
    }
    return __stable_partition<__add_lvalue_reference_t<_Predicate>>
                             (__first, __last, __pred, __len, __p, forward_iterator_tag());
}

template <class _Predicate, class _BidirectionalIterator, class _Distance, class _Pair>
_LIBCUDACXX_HOST_DEVICE
_BidirectionalIterator
__stable_partition(_BidirectionalIterator __first, _BidirectionalIterator __last, _Predicate __pred,
                   _Distance __len, _Pair __p, bidirectional_iterator_tag __bit)
{
    // *__first is known to be false
    // *__last is known to be true
    // __len >= 2
    if (__len == 2)
    {
        swap(*__first, *__last);
        return __last;
    }
    if (__len == 3)
    {
        _BidirectionalIterator __m = __first;
        if (__pred(*++__m))
        {
            swap(*__first, *__m);
            swap(*__m, *__last);
            return __last;
        }
        swap(*__m, *__last);
        swap(*__first, *__m);
        return __m;
    }
    if (__len <= __p.second)
    {   // The buffer is big enough to use
        typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type;
        __destruct_n __d(0);
        unique_ptr<value_type, __destruct_n&> __h(__p.first, __d);
        // Move the falses into the temporary buffer, and the trues to the front of the line
        // Update __first to always point to the end of the trues
        value_type* __t = __p.first;
        ::new(__t) value_type(_CUDA_VSTD::move(*__first));
        __d.__incr((value_type*)0);
        ++__t;
        _BidirectionalIterator __i = __first;
        while (++__i != __last)
        {
            if (__pred(*__i))
            {
                *__first = _CUDA_VSTD::move(*__i);
                ++__first;
            }
            else
            {
                ::new(__t) value_type(_CUDA_VSTD::move(*__i));
                __d.__incr((value_type*)0);
                ++__t;
            }
        }
        // move *__last, known to be true
        *__first = _CUDA_VSTD::move(*__i);
        __i = ++__first;
        // All trues now at start of range, all falses in buffer
        // Move falses back into range, but don't mess up __first which points to first false
        for (value_type* __t2 = __p.first; __t2 < __t; ++__t2, (void) ++__i)
            *__i = _CUDA_VSTD::move(*__t2);
        // __h destructs moved-from values out of the temp buffer, but doesn't deallocate buffer
        return __first;
    }
    // Else not enough buffer, do in place
    // __len >= 4
    _BidirectionalIterator __m = __first;
    _Distance __len2 = __len / 2;  // __len2 >= 2
    _CUDA_VSTD::advance(__m, __len2);
    // recurse on [__first, __m-1], except reduce __m-1 until *(__m-1) is true, *__first know to be false
    // F????????????????T
    // f       m        l
    _BidirectionalIterator __m1 = __m;
    _BidirectionalIterator __first_false = __first;
    _Distance __len_half = __len2;
    while (!__pred(*--__m1))
    {
        if (__m1 == __first)
            goto __first_half_done;
        --__len_half;
    }
    // F???TFFF?????????T
    // f   m1  m        l
    typedef __add_lvalue_reference_t<_Predicate> _PredRef;
    __first_false = __stable_partition<_PredRef>(__first, __m1, __pred, __len_half, __p, __bit);
__first_half_done:
    // TTTFFFFF?????????T
    // f  ff   m        l
    // recurse on [__m, __last], except increase __m until *(__m) is false, *__last know to be true
    __m1 = __m;
    _BidirectionalIterator __second_false = __last;
    ++__second_false;
    __len_half = __len - __len2;
    while (__pred(*__m1))
    {
        if (++__m1 == __last)
            goto __second_half_done;
        --__len_half;
    }
    // TTTFFFFFTTTF?????T
    // f  ff   m  m1    l
    __second_false = __stable_partition<_PredRef>(__m1, __last, __pred, __len_half, __p, __bit);
__second_half_done:
    // TTTFFFFFTTTTTFFFFF
    // f  ff   m    sf  l
    return _CUDA_VSTD::rotate(__first_false, __m, __second_false);
    // TTTTTTTTFFFFFFFFFF
    //         |
}

template <class _Predicate, class _BidirectionalIterator>
_LIBCUDACXX_HOST_DEVICE
_BidirectionalIterator
__stable_partition(_BidirectionalIterator __first, _BidirectionalIterator __last, _Predicate __pred,
                   bidirectional_iterator_tag)
{
    typedef typename iterator_traits<_BidirectionalIterator>::difference_type difference_type;
    typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type;
    const difference_type __alloc_limit = 4;  // might want to make this a function of trivial assignment
    // Either prove all true and return __first or point to first false
    while (true)
    {
        if (__first == __last)
            return __first;
        if (!__pred(*__first))
            break;
        ++__first;
    }
    // __first points to first false, everything prior to __first is already set.
    // Either prove [__first, __last) is all false and return __first, or point __last to last true
    do
    {
        if (__first == --__last)
            return __first;
    } while (!__pred(*__last));
    // We now have a reduced range [__first, __last]
    // *__first is known to be false
    // *__last is known to be true
    // __len >= 2
    difference_type __len = _CUDA_VSTD::distance(__first, __last) + 1;
    pair<value_type*, ptrdiff_t> __p(0, 0);
    unique_ptr<value_type, __return_temporary_buffer> __h;
    if (__len >= __alloc_limit)
    {
        __p = _CUDA_VSTD::get_temporary_buffer<value_type>(__len);
        __h.reset(__p.first);
    }
    return __stable_partition<__add_lvalue_reference_t<_Predicate>>
                             (__first, __last, __pred, __len, __p, bidirectional_iterator_tag());
}

template <class _ForwardIterator, class _Predicate>
inline _LIBCUDACXX_INLINE_VISIBILITY
_ForwardIterator
stable_partition(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred)
{
    return __stable_partition<__add_lvalue_reference_t<_Predicate>>
                             (__first, __last, __pred, typename iterator_traits<_ForwardIterator>::iterator_category());
}

// sort

// stable, 2-3 compares, 0-2 swaps

template <class _Compare, class _ForwardIterator>
_LIBCUDACXX_HOST_DEVICE
unsigned
__sort3(_ForwardIterator __x, _ForwardIterator __y, _ForwardIterator __z, _Compare __c)
{
    unsigned __r = 0;
    if (!__c(*__y, *__x))          // if x <= y
    {
        if (!__c(*__z, *__y))      // if y <= z
            return __r;            // x <= y && y <= z
                                   // x <= y && y > z
        swap(*__y, *__z);          // x <= z && y < z
        __r = 1;
        if (__c(*__y, *__x))       // if x > y
        {
            swap(*__x, *__y);      // x < y && y <= z
            __r = 2;
        }
        return __r;                // x <= y && y < z
    }
    if (__c(*__z, *__y))           // x > y, if y > z
    {
        swap(*__x, *__z);          // x < y && y < z
        __r = 1;
        return __r;
    }
    swap(*__x, *__y);              // x > y && y <= z
    __r = 1;                       // x < y && x <= z
    if (__c(*__z, *__y))           // if y > z
    {
        swap(*__y, *__z);          // x <= y && y < z
        __r = 2;
    }
    return __r;
}                                  // x <= y && y <= z

// stable, 3-6 compares, 0-5 swaps

template <class _Compare, class _ForwardIterator>
_LIBCUDACXX_HOST_DEVICE
unsigned
__sort4(_ForwardIterator __x1, _ForwardIterator __x2, _ForwardIterator __x3,
            _ForwardIterator __x4, _Compare __c)
{
    unsigned __r = __sort3<_Compare>(__x1, __x2, __x3, __c);
    if (__c(*__x4, *__x3))
    {
        swap(*__x3, *__x4);
        ++__r;
        if (__c(*__x3, *__x2))
        {
            swap(*__x2, *__x3);
            ++__r;
            if (__c(*__x2, *__x1))
            {
                swap(*__x1, *__x2);
                ++__r;
            }
        }
    }
    return __r;
}

// stable, 4-10 compares, 0-9 swaps

template <class _Compare, class _ForwardIterator>
_LIBCUDACXX_HIDDEN
_LIBCUDACXX_HOST_DEVICE
unsigned
__sort5(_ForwardIterator __x1, _ForwardIterator __x2, _ForwardIterator __x3,
            _ForwardIterator __x4, _ForwardIterator __x5, _Compare __c)
{
    unsigned __r = __sort4<_Compare>(__x1, __x2, __x3, __x4, __c);
    if (__c(*__x5, *__x4))
    {
        swap(*__x4, *__x5);
        ++__r;
        if (__c(*__x4, *__x3))
        {
            swap(*__x3, *__x4);
            ++__r;
            if (__c(*__x3, *__x2))
            {
                swap(*__x2, *__x3);
                ++__r;
                if (__c(*__x2, *__x1))
                {
                    swap(*__x1, *__x2);
                    ++__r;
                }
            }
        }
    }
    return __r;
}

// Assumes size > 0
template <class _Compare, class _BirdirectionalIterator>
_LIBCUDACXX_HOST_DEVICE
void
__selection_sort(_BirdirectionalIterator __first, _BirdirectionalIterator __last, _Compare __comp)
{
    _BirdirectionalIterator __lm1 = __last;
    for (--__lm1; __first != __lm1; ++__first)
    {
        _BirdirectionalIterator __i = _CUDA_VSTD::min_element<_BirdirectionalIterator,
                                                        __add_lvalue_reference_t<_Compare>>
                                                       (__first, __last, __comp);
        if (__i != __first)
            swap(*__first, *__i);
    }
}

template <class _Compare, class _BirdirectionalIterator>
_LIBCUDACXX_HOST_DEVICE
void
__insertion_sort(_BirdirectionalIterator __first, _BirdirectionalIterator __last, _Compare __comp)
{
    typedef typename iterator_traits<_BirdirectionalIterator>::value_type value_type;
    if (__first != __last)
    {
        _BirdirectionalIterator __i = __first;
        for (++__i; __i != __last; ++__i)
        {
            _BirdirectionalIterator __j = __i;
            value_type __t(_CUDA_VSTD::move(*__j));
            for (_BirdirectionalIterator __k = __i; __k != __first && __comp(__t,  *--__k); --__j)
                *__j = _CUDA_VSTD::move(*__k);
            *__j = _CUDA_VSTD::move(__t);
        }
    }
}

template <class _Compare, class _RandomAccessIterator>
_LIBCUDACXX_HOST_DEVICE
void
__insertion_sort_3(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
{
    typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
    _RandomAccessIterator __j = __first+2;
    __sort3<_Compare>(__first, __first+1, __j, __comp);
    for (_RandomAccessIterator __i = __j+1; __i != __last; ++__i)
    {
        if (__comp(*__i, *__j))
        {
            value_type __t(_CUDA_VSTD::move(*__i));
            _RandomAccessIterator __k = __j;
            __j = __i;
            do
            {
                *__j = _CUDA_VSTD::move(*__k);
                __j = __k;
            } while (__j != __first && __comp(__t, *--__k));
            *__j = _CUDA_VSTD::move(__t);
        }
        __j = __i;
    }
}

template <class _Compare, class _RandomAccessIterator>
_LIBCUDACXX_HOST_DEVICE
bool
__insertion_sort_incomplete(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
{
    switch (__last - __first)
    {
    case 0:
    case 1:
        return true;
    case 2:
        if (__comp(*--__last, *__first))
            swap(*__first, *__last);
        return true;
    case 3:
        _CUDA_VSTD::__sort3<_Compare>(__first, __first+1, --__last, __comp);
        return true;
    case 4:
        _CUDA_VSTD::__sort4<_Compare>(__first, __first+1, __first+2, --__last, __comp);
        return true;
    case 5:
        _CUDA_VSTD::__sort5<_Compare>(__first, __first+1, __first+2, __first+3, --__last, __comp);
        return true;
    }
    typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
    _RandomAccessIterator __j = __first+2;
    __sort3<_Compare>(__first, __first+1, __j, __comp);
    const unsigned __limit = 8;
    unsigned __count = 0;
    for (_RandomAccessIterator __i = __j+1; __i != __last; ++__i)
    {
        if (__comp(*__i, *__j))
        {
            value_type __t(_CUDA_VSTD::move(*__i));
            _RandomAccessIterator __k = __j;
            __j = __i;
            do
            {
                *__j = _CUDA_VSTD::move(*__k);
                __j = __k;
            } while (__j != __first && __comp(__t, *--__k));
            *__j = _CUDA_VSTD::move(__t);
            if (++__count == __limit)
                return ++__i == __last;
        }
        __j = __i;
    }
    return true;
}

template <class _Compare, class _BirdirectionalIterator>
_LIBCUDACXX_HOST_DEVICE
void
__insertion_sort_move(_BirdirectionalIterator __first1, _BirdirectionalIterator __last1,
                      typename iterator_traits<_BirdirectionalIterator>::value_type* __first2, _Compare __comp)
{
    typedef typename iterator_traits<_BirdirectionalIterator>::value_type value_type;
    if (__first1 != __last1)
    {
        __destruct_n __d(0);
        unique_ptr<value_type, __destruct_n&> __h(__first2, __d);
        value_type* __last2 = __first2;
        ::new(__last2) value_type(_CUDA_VSTD::move(*__first1));
        __d.__incr((value_type*)0);
        for (++__last2; ++__first1 != __last1; ++__last2)
        {
            value_type* __j2 = __last2;
            value_type* __i2 = __j2;
            if (__comp(*__first1, *--__i2))
            {
                ::new(__j2) value_type(_CUDA_VSTD::move(*__i2));
                __d.__incr((value_type*)0);
                for (--__j2; __i2 != __first2 && __comp(*__first1,  *--__i2); --__j2)
                    *__j2 = _CUDA_VSTD::move(*__i2);
                *__j2 = _CUDA_VSTD::move(*__first1);
            }
            else
            {
                ::new(__j2) value_type(_CUDA_VSTD::move(*__first1));
                __d.__incr((value_type*)0);
            }
        }
        __h.release();
    }
}

template <class _Compare, class _RandomAccessIterator>
_LIBCUDACXX_HOST_DEVICE
void
__sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
{
    // _Compare is known to be a reference type
    typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
    typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
    const difference_type __limit = is_trivially_copy_constructible<value_type>::value &&
                                    is_trivially_copy_assignable<value_type>::value ? 30 : 6;
    while (true)
    {
    __restart:
        difference_type __len = __last - __first;
        switch (__len)
        {
        case 0:
        case 1:
            return;
        case 2:
            if (__comp(*--__last, *__first))
                swap(*__first, *__last);
            return;
        case 3:
            _CUDA_VSTD::__sort3<_Compare>(__first, __first+1, --__last, __comp);
            return;
        case 4:
            _CUDA_VSTD::__sort4<_Compare>(__first, __first+1, __first+2, --__last, __comp);
            return;
        case 5:
            _CUDA_VSTD::__sort5<_Compare>(__first, __first+1, __first+2, __first+3, --__last, __comp);
            return;
        }
        if (__len <= __limit)
        {
            _CUDA_VSTD::__insertion_sort_3<_Compare>(__first, __last, __comp);
            return;
        }
        // __len > 5
        _RandomAccessIterator __m = __first;
        _RandomAccessIterator __lm1 = __last;
        --__lm1;
        unsigned __n_swaps;
        {
        difference_type __delta;
        if (__len >= 1000)
        {
            __delta = __len/2;
            __m += __delta;
            __delta /= 2;
            __n_swaps = _CUDA_VSTD::__sort5<_Compare>(__first, __first + __delta, __m, __m+__delta, __lm1, __comp);
        }
        else
        {
            __delta = __len/2;
            __m += __delta;
            __n_swaps = _CUDA_VSTD::__sort3<_Compare>(__first, __m, __lm1, __comp);
        }
        }
        // *__m is median
        // partition [__first, __m) < *__m and *__m <= [__m, __last)
        // (this inhibits tossing elements equivalent to __m around unnecessarily)
        _RandomAccessIterator __i = __first;
        _RandomAccessIterator __j = __lm1;
        // j points beyond range to be tested, *__m is known to be <= *__lm1
        // The search going up is known to be guarded but the search coming down isn't.
        // Prime the downward search with a guard.
        if (!__comp(*__i, *__m))  // if *__first == *__m
        {
            // *__first == *__m, *__first doesn't go in first part
            // manually guard downward moving __j against __i
            while (true)
            {
                if (__i == --__j)
                {
                    // *__first == *__m, *__m <= all other elements
                    // Parition instead into [__first, __i) == *__first and *__first < [__i, __last)
                    ++__i;  // __first + 1
                    __j = __last;
                    if (!__comp(*__first, *--__j))  // we need a guard if *__first == *(__last-1)
                    {
                        while (true)
                        {
                            if (__i == __j)
                                return;  // [__first, __last) all equivalent elements
                            if (__comp(*__first, *__i))
                            {
                                swap(*__i, *__j);
                                ++__n_swaps;
                                ++__i;
                                break;
                            }
                            ++__i;
                        }
                    }
                    // [__first, __i) == *__first and *__first < [__j, __last) and __j == __last - 1
                    if (__i == __j)
                        return;
                    while (true)
                    {
                        while (!__comp(*__first, *__i))
                            ++__i;
                        while (__comp(*__first, *--__j))
                            ;
                        if (__i >= __j)
                            break;
                        swap(*__i, *__j);
                        ++__n_swaps;
                        ++__i;
                    }
                    // [__first, __i) == *__first and *__first < [__i, __last)
                    // The first part is sorted, sort the secod part
                    // _CUDA_VSTD::__sort<_Compare>(__i, __last, __comp);
                    __first = __i;
                    goto __restart;
                }
                if (__comp(*__j, *__m))
                {
                    swap(*__i, *__j);
                    ++__n_swaps;
                    break;  // found guard for downward moving __j, now use unguarded partition
                }
            }
        }
        // It is known that *__i < *__m
        ++__i;
        // j points beyond range to be tested, *__m is known to be <= *__lm1
        // if not yet partitioned...
        if (__i < __j)
        {
            // known that *(__i - 1) < *__m
            // known that __i <= __m
            while (true)
            {
                // __m still guards upward moving __i
                while (__comp(*__i, *__m))
                    ++__i;
                // It is now known that a guard exists for downward moving __j
                while (!__comp(*--__j, *__m))
                    ;
                if (__i > __j)
                    break;
                swap(*__i, *__j);
                ++__n_swaps;
                // It is known that __m != __j
                // If __m just moved, follow it
                if (__m == __i)
                    __m = __j;
                ++__i;
            }
        }
        // [__first, __i) < *__m and *__m <= [__i, __last)
        if (__i != __m && __comp(*__m, *__i))
        {
            swap(*__i, *__m);
            ++__n_swaps;
        }
        // [__first, __i) < *__i and *__i <= [__i+1, __last)
        // If we were given a perfect partition, see if insertion sort is quick...
        if (__n_swaps == 0)
        {
            bool __fs = _CUDA_VSTD::__insertion_sort_incomplete<_Compare>(__first, __i, __comp);
            if (_CUDA_VSTD::__insertion_sort_incomplete<_Compare>(__i+1, __last, __comp))
            {
                if (__fs)
                    return;
                __last = __i;
                continue;
            }
            else
            {
                if (__fs)
                {
                    __first = ++__i;
                    continue;
                }
            }
        }
        // sort smaller range with recursive call and larger with tail recursion elimination
        if (__i - __first < __last - __i)
        {
            _CUDA_VSTD::__sort<_Compare>(__first, __i, __comp);
            // _CUDA_VSTD::__sort<_Compare>(__i+1, __last, __comp);
            __first = ++__i;
        }
        else
        {
            _CUDA_VSTD::__sort<_Compare>(__i+1, __last, __comp);
            // _CUDA_VSTD::__sort<_Compare>(__first, __i, __comp);
            __last = __i;
        }
    }
}

// This forwarder keeps the top call and the recursive calls using the same instantiation, forcing a reference _Compare
template <class _RandomAccessIterator, class _Compare>
inline _LIBCUDACXX_INLINE_VISIBILITY
void
sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
{
    using _Comp_ref = __comp_ref_type<_Compare>;
    _CUDA_VSTD::__sort<_Comp_ref>(__first, __last, _Comp_ref(__comp));
}

template <class _RandomAccessIterator>
inline _LIBCUDACXX_INLINE_VISIBILITY
void
sort(_RandomAccessIterator __first, _RandomAccessIterator __last)
{
    _CUDA_VSTD::sort(__first, __last, __less{});
}

template <class _Tp>
inline _LIBCUDACXX_INLINE_VISIBILITY
void
sort(_Tp** __first, _Tp** __last)
{
    _CUDA_VSTD::sort((size_t*)__first, (size_t*)__last, __less{});
}

template <class _Tp>
inline _LIBCUDACXX_INLINE_VISIBILITY
void
sort(__wrap_iter<_Tp*> __first, __wrap_iter<_Tp*> __last)
{
    _CUDA_VSTD::sort(__first.base(), __last.base());
}

template <class _Tp, class _Compare>
inline _LIBCUDACXX_INLINE_VISIBILITY
void
sort(__wrap_iter<_Tp*> __first, __wrap_iter<_Tp*> __last, _Compare __comp)
{
    typedef __add_lvalue_reference_t<_Compare> _Comp_ref;
    _CUDA_VSTD::sort<_Tp*, _Comp_ref>(__first.base(), __last.base(), __comp);
}

_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, char*>(char*, char*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, wchar_t*>(wchar_t*, wchar_t*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, signed*>(signed*, signed*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, unsigned char*>(unsigned char*, unsigned char*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, short*>(short*, short*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, unsigned short*>(unsigned short*, unsigned short*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, int*>(int*, int*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, unsigned*>(unsigned*, unsigned*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, long*>(long*, long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, unsigned long*>(unsigned long*, unsigned long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, long long*>(long long*, long long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, unsigned long long*>(unsigned long long*, unsigned long long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, float*>(float*, float*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, double*>(double*, double*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS void __sort<__less&, long double*>(long double*, long double*, __less&))

_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, char*>(char*, char*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, wchar_t*>(wchar_t*, wchar_t*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, signed char*>(signed char*, signed char*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, unsigned char*>(unsigned char*, unsigned char*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, short*>(short*, short*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, unsigned short*>(unsigned short*, unsigned short*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, int*>(int*, int*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, unsigned*>(unsigned*, unsigned*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, long*>(long*, long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, unsigned long*>(unsigned long*, unsigned long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, long long*>(long long*, long long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, unsigned long long*>(unsigned long long*, unsigned long long*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, float*>(float*, float*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, double*>(double*, double*, __less&))
_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS bool __insertion_sort_incomplete<__less&, long double*>(long double*, long double*, __less&))

_LIBCUDACXX_EXTERN_TEMPLATE(_LIBCUDACXX_FUNC_VIS unsigned __sort5<__less&, long double*>(long double*, long double*, long double*, long double*, long double*, __less&))

// inplace_merge

template <class _Compare, class _InputIterator1, class _InputIterator2,
          class _OutputIterator>
_LIBCUDACXX_HOST_DEVICE
void __half_inplace_merge(_InputIterator1 __first1, _InputIterator1 __last1,
                          _InputIterator2 __first2, _InputIterator2 __last2,
                          _OutputIterator __result, _Compare __comp)
{
    for (; __first1 != __last1; ++__result)
    {
        if (__first2 == __last2)
        {
            _CUDA_VSTD::move(__first1, __last1, __result);
            return;
        }

        if (__comp(*__first2, *__first1))
        {
            *__result = _CUDA_VSTD::move(*__first2);
            ++__first2;
        }
        else
        {
            *__result = _CUDA_VSTD::move(*__first1);
            ++__first1;
        }
    }
    // __first2 through __last2 are already in the right spot.
}

template <class _Compare, class _BidirectionalIterator>
_LIBCUDACXX_HOST_DEVICE
void
__buffered_inplace_merge(_BidirectionalIterator __first, _BidirectionalIterator __middle, _BidirectionalIterator __last,
                _Compare __comp, typename iterator_traits<_BidirectionalIterator>::difference_type __len1,
                                 typename iterator_traits<_BidirectionalIterator>::difference_type __len2,
                typename iterator_traits<_BidirectionalIterator>::value_type* __buff)
{
    typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type;
    __destruct_n __d(0);
    unique_ptr<value_type, __destruct_n&> __h2(__buff, __d);
    if (__len1 <= __len2)
    {
        value_type* __p = __buff;
        for (_BidirectionalIterator __i = __first; __i != __middle; __d.__incr((value_type*)0), (void) ++__i, (void) ++__p)
            ::new(__p) value_type(_CUDA_VSTD::move(*__i));
        __half_inplace_merge(__buff, __p, __middle, __last, __first, __comp);
    }
    else
    {
        value_type* __p = __buff;
        for (_BidirectionalIterator __i = __middle; __i != __last; __d.__incr((value_type*)0), (void) ++__i, (void) ++__p)
            ::new(__p) value_type(_CUDA_VSTD::move(*__i));
        typedef reverse_iterator<_BidirectionalIterator> _RBi;
        typedef reverse_iterator<value_type*> _Rv;
        __half_inplace_merge(_Rv(__p), _Rv(__buff),
                             _RBi(__middle), _RBi(__first),
                             _RBi(__last), __invert<_Compare>(__comp));
    }
}

template <class _Compare, class _BidirectionalIterator>
_LIBCUDACXX_HOST_DEVICE
void
__inplace_merge(_BidirectionalIterator __first, _BidirectionalIterator __middle, _BidirectionalIterator __last,
                _Compare __comp, typename iterator_traits<_BidirectionalIterator>::difference_type __len1,
                                 typename iterator_traits<_BidirectionalIterator>::difference_type __len2,
                typename iterator_traits<_BidirectionalIterator>::value_type* __buff, ptrdiff_t __buff_size)
{
    typedef typename iterator_traits<_BidirectionalIterator>::difference_type difference_type;
    while (true)
    {
        // if __middle == __last, we're done
        if (__len2 == 0)
            return;
        if (__len1 <= __buff_size || __len2 <= __buff_size)
            return __buffered_inplace_merge<_Compare>
                   (__first, __middle, __last, __comp, __len1, __len2, __buff);
        // shrink [__first, __middle) as much as possible (with no moves), returning if it shrinks to 0
        for (; true; ++__first, (void) --__len1)
        {
            if (__len1 == 0)
                return;
            if (__comp(*__middle, *__first))
                break;
        }
        // __first < __middle < __last
        // *__first > *__middle
        // partition [__first, __m1) [__m1, __middle) [__middle, __m2) [__m2, __last) such that
        //     all elements in:
        //         [__first, __m1)  <= [__middle, __m2)
        //         [__middle, __m2) <  [__m1, __middle)
        //         [__m1, __middle) <= [__m2, __last)
        //     and __m1 or __m2 is in the middle of its range
        _BidirectionalIterator __m1;  // "median" of [__first, __middle)
        _BidirectionalIterator __m2;  // "median" of [__middle, __last)
        difference_type __len11;      // distance(__first, __m1)
        difference_type __len21;      // distance(__middle, __m2)
        // binary search smaller range
        if (__len1 < __len2)
        {   // __len >= 1, __len2 >= 2
            __len21 = __len2 / 2;
            __m2 = __middle;
            _CUDA_VSTD::advance(__m2, __len21);
            __m1 = __upper_bound<_Compare>(__first, __middle, *__m2, __comp);
            __len11 = _CUDA_VSTD::distance(__first, __m1);
        }
        else
        {
            if (__len1 == 1)
            {   // __len1 >= __len2 && __len2 > 0, therefore __len2 == 1
                // It is known *__first > *__middle
                swap(*__first, *__middle);
                return;
            }
            // __len1 >= 2, __len2 >= 1
            __len11 = __len1 / 2;
            __m1 = __first;
            _CUDA_VSTD::advance(__m1, __len11);
            __m2 = __lower_bound<_Compare>(__middle, __last, *__m1, __comp);
            __len21 = _CUDA_VSTD::distance(__middle, __m2);
        }
        difference_type __len12 = __len1 - __len11;  // distance(__m1, __middle)
        difference_type __len22 = __len2 - __len21;  // distance(__m2, __last)
        // [__first, __m1) [__m1, __middle) [__middle, __m2) [__m2, __last)
        // swap middle two partitions
        __middle = _CUDA_VSTD::rotate(__m1, __middle, __m2);
        // __len12 and __len21 now have swapped meanings
        // merge smaller range with recurisve call and larger with tail recursion elimination
        if (__len11 + __len21 < __len12 + __len22)
        {
            __inplace_merge<_Compare>(__first, __m1, __middle, __comp, __len11, __len21, __buff, __buff_size);
//          __inplace_merge<_Compare>(__middle, __m2, __last, __comp, __len12, __len22, __buff, __buff_size);
            __first = __middle;
            __middle = __m2;
            __len1 = __len12;
            __len2 = __len22;
        }
        else
        {
            __inplace_merge<_Compare>(__middle, __m2, __last, __comp, __len12, __len22, __buff, __buff_size);
//          __inplace_merge<_Compare>(__first, __m1, __middle, __comp, __len11, __len21, __buff, __buff_size);
            __last = __middle;
            __middle = __m1;
            __len1 = __len11;
            __len2 = __len21;
        }
    }
}

template <class _BidirectionalIterator, class _Compare>
inline _LIBCUDACXX_INLINE_VISIBILITY
void
inplace_merge(_BidirectionalIterator __first, _BidirectionalIterator __middle, _BidirectionalIterator __last,
              _Compare __comp)
{
    typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type;
    typedef typename iterator_traits<_BidirectionalIterator>::difference_type difference_type;
    difference_type __len1 = _CUDA_VSTD::distance(__first, __middle);
    difference_type __len2 = _CUDA_VSTD::distance(__middle, __last);
    difference_type __buf_size = _CUDA_VSTD::min(__len1, __len2);
    pair<value_type*, ptrdiff_t> __buf = _CUDA_VSTD::get_temporary_buffer<value_type>(__buf_size);
    unique_ptr<value_type, __return_temporary_buffer> __h(__buf.first);
    using _Comp_ref = __comp_ref_type<_Compare>;
    return _CUDA_VSTD::__inplace_merge<_Comp_ref>(__first, __middle, __last, __comp, __len1, __len2,
                                            __buf.first, __buf.second);
}

template <class _BidirectionalIterator>
inline _LIBCUDACXX_INLINE_VISIBILITY
void
inplace_merge(_BidirectionalIterator __first, _BidirectionalIterator __middle, _BidirectionalIterator __last)
{
    _CUDA_VSTD::inplace_merge(__first, __middle, __last,
                        __less{});
}

// stable_sort

template <class _Compare, class _InputIterator1, class _InputIterator2>
_LIBCUDACXX_HOST_DEVICE
void
__merge_move_construct(_InputIterator1 __first1, _InputIterator1 __last1,
        _InputIterator2 __first2, _InputIterator2 __last2,
        typename iterator_traits<_InputIterator1>::value_type* __result, _Compare __comp)
{
    typedef typename iterator_traits<_InputIterator1>::value_type value_type;
    __destruct_n __d(0);
    unique_ptr<value_type, __destruct_n&> __h(__result, __d);
    for (; true; ++__result)
    {
        if (__first1 == __last1)
        {
            for (; __first2 != __last2; ++__first2, ++__result, (void) __d.__incr((value_type*)0))
                ::new (__result) value_type(_CUDA_VSTD::move(*__first2));
            __h.release();
            return;
        }
        if (__first2 == __last2)
        {
            for (; __first1 != __last1; ++__first1, ++__result, (void) __d.__incr((value_type*)0))
                ::new (__result) value_type(_CUDA_VSTD::move(*__first1));
            __h.release();
            return;
        }
        if (__comp(*__first2, *__first1))
        {
            ::new (__result) value_type(_CUDA_VSTD::move(*__first2));
            __d.__incr((value_type*)0);
            ++__first2;
        }
        else
        {
            ::new (__result) value_type(_CUDA_VSTD::move(*__first1));
            __d.__incr((value_type*)0);
            ++__first1;
        }
    }
}

template <class _Compare, class _InputIterator1, class _InputIterator2, class _OutputIterator>
_LIBCUDACXX_HOST_DEVICE
void
__merge_move_assign(_InputIterator1 __first1, _InputIterator1 __last1,
        _InputIterator2 __first2, _InputIterator2 __last2,
        _OutputIterator __result, _Compare __comp)
{
    for (; __first1 != __last1; ++__result)
    {
        if (__first2 == __last2)
        {
            for (; __first1 != __last1; ++__first1, (void) ++__result)
                *__result = _CUDA_VSTD::move(*__first1);
            return;
        }
        if (__comp(*__first2, *__first1))
        {
            *__result = _CUDA_VSTD::move(*__first2);
            ++__first2;
        }
        else
        {
            *__result = _CUDA_VSTD::move(*__first1);
            ++__first1;
        }
    }
    for (; __first2 != __last2; ++__first2, (void) ++__result)
        *__result = _CUDA_VSTD::move(*__first2);
}

template <class _Compare, class _RandomAccessIterator>
_LIBCUDACXX_HOST_DEVICE
void
__stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp,
              typename iterator_traits<_RandomAccessIterator>::difference_type __len,
              typename iterator_traits<_RandomAccessIterator>::value_type* __buff, ptrdiff_t __buff_size);

template <class _Compare, class _RandomAccessIterator>
_LIBCUDACXX_HOST_DEVICE
void
__stable_sort_move(_RandomAccessIterator __first1, _RandomAccessIterator __last1, _Compare __comp,
                   typename iterator_traits<_RandomAccessIterator>::difference_type __len,
                   typename iterator_traits<_RandomAccessIterator>::value_type* __first2)
{
    typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
    switch (__len)
    {
    case 0:
        return;
    case 1:
        ::new(__first2) value_type(_CUDA_VSTD::move(*__first1));
        return;
    case 2:
        __destruct_n __d(0);
        unique_ptr<value_type, __destruct_n&> __h2(__first2, __d);
        if (__comp(*--__last1, *__first1))
        {
            ::new(__first2) value_type(_CUDA_VSTD::move(*__last1));
            __d.__incr((value_type*)0);
            ++__first2;
            ::new(__first2) value_type(_CUDA_VSTD::move(*__first1));
        }
        else
        {
            ::new(__first2) value_type(_CUDA_VSTD::move(*__first1));
            __d.__incr((value_type*)0);
            ++__first2;
            ::new(__first2) value_type(_CUDA_VSTD::move(*__last1));
        }
        __h2.release();
        return;
    }
    if (__len <= 8)
    {
        __insertion_sort_move<_Compare>(__first1, __last1, __first2, __comp);
        return;
    }
    typename iterator_traits<_RandomAccessIterator>::difference_type __l2 = __len / 2;
    _RandomAccessIterator __m = __first1 + __l2;
    __stable_sort<_Compare>(__first1, __m, __comp, __l2, __first2, __l2);
    __stable_sort<_Compare>(__m, __last1, __comp, __len - __l2, __first2 + __l2, __len - __l2);
    __merge_move_construct<_Compare>(__first1, __m, __m, __last1, __first2, __comp);
}

template <class _Tp>
struct __stable_sort_switch
{
    static const unsigned value = 128*is_trivially_copy_assignable<_Tp>::value;
};

template <class _Compare, class _RandomAccessIterator>
_LIBCUDACXX_HOST_DEVICE
void
__stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp,
              typename iterator_traits<_RandomAccessIterator>::difference_type __len,
              typename iterator_traits<_RandomAccessIterator>::value_type* __buff, ptrdiff_t __buff_size)
{
    typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
    typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
    switch (__len)
    {
    case 0:
    case 1:
        return;
    case 2:
        if (__comp(*--__last, *__first))
            swap(*__first, *__last);
        return;
    }
    if (__len <= static_cast<difference_type>(__stable_sort_switch<value_type>::value))
    {
        __insertion_sort<_Compare>(__first, __last, __comp);
        return;
    }
    typename iterator_traits<_RandomAccessIterator>::difference_type __l2 = __len / 2;
    _RandomAccessIterator __m = __first + __l2;
    if (__len <= __buff_size)
    {
        __destruct_n __d(0);
        unique_ptr<value_type, __destruct_n&> __h2(__buff, __d);
        __stable_sort_move<_Compare>(__first, __m, __comp, __l2, __buff);
        __d.__set(__l2, (value_type*)0);
        __stable_sort_move<_Compare>(__m, __last, __comp, __len - __l2, __buff + __l2);
        __d.__set(__len, (value_type*)0);
        __merge_move_assign<_Compare>(__buff, __buff + __l2, __buff + __l2, __buff + __len, __first, __comp);
//         __merge<_Compare>(move_iterator<value_type*>(__buff),
//                           move_iterator<value_type*>(__buff + __l2),
//                           move_iterator<_RandomAccessIterator>(__buff + __l2),
//                           move_iterator<_RandomAccessIterator>(__buff + __len),
//                           __first, __comp);
        return;
    }
    __stable_sort<_Compare>(__first, __m, __comp, __l2, __buff, __buff_size);
    __stable_sort<_Compare>(__m, __last, __comp, __len - __l2, __buff, __buff_size);
    __inplace_merge<_Compare>(__first, __m, __last, __comp, __l2, __len - __l2, __buff, __buff_size);
}

template <class _RandomAccessIterator, class _Compare>
inline _LIBCUDACXX_INLINE_VISIBILITY
void
stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
{
    typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
    typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
    difference_type __len = __last - __first;
    pair<value_type*, ptrdiff_t> __buf(0, 0);
    unique_ptr<value_type, __return_temporary_buffer> __h;
    if (__len > static_cast<difference_type>(__stable_sort_switch<value_type>::value))
    {
        __buf = _CUDA_VSTD::get_temporary_buffer<value_type>(__len);
        __h.reset(__buf.first);
    }
    using _Comp_ref = __comp_ref_type<_Compare>;
    __stable_sort<_Comp_ref>(__first, __last, __comp, __len, __buf.first, __buf.second);
}

template <class _RandomAccessIterator>
inline _LIBCUDACXX_INLINE_VISIBILITY
void
stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last)
{
    _CUDA_VSTD::stable_sort(__first, __last, __less{});
}

// nth_element

template <class _Compare, class _RandomAccessIterator>
_LIBCUDACXX_HOST_DEVICE
void
__nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth, _RandomAccessIterator __last, _Compare __comp)
{
    // _Compare is known to be a reference type
    typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
    const difference_type __limit = 7;
    while (true)
    {
    __restart:
        if (__nth == __last)
            return;
        difference_type __len = __last - __first;
        switch (__len)
        {
        case 0:
        case 1:
            return;
        case 2:
            if (__comp(*--__last, *__first))
                swap(*__first, *__last);
            return;
        case 3:
            {
            _RandomAccessIterator __m = __first;
            _CUDA_VSTD::__sort3<_Compare>(__first, ++__m, --__last, __comp);
            return;
            }
        }
        if (__len <= __limit)
        {
            __selection_sort<_Compare>(__first, __last, __comp);
            return;
        }
        // __len > __limit >= 3
        _RandomAccessIterator __m = __first + __len/2;
        _RandomAccessIterator __lm1 = __last;
        unsigned __n_swaps = _CUDA_VSTD::__sort3<_Compare>(__first, __m, --__lm1, __comp);
        // *__m is median
        // partition [__first, __m) < *__m and *__m <= [__m, __last)
        // (this inhibits tossing elements equivalent to __m around unnecessarily)
        _RandomAccessIterator __i = __first;
        _RandomAccessIterator __j = __lm1;
        // j points beyond range to be tested, *__lm1 is known to be <= *__m
        // The search going up is known to be guarded but the search coming down isn't.
        // Prime the downward search with a guard.
        if (!__comp(*__i, *__m))  // if *__first == *__m
        {
            // *__first == *__m, *__first doesn't go in first part
            // manually guard downward moving __j against __i
            while (true)
            {
                if (__i == --__j)
                {
                    // *__first == *__m, *__m <= all other elements
                    // Parition instead into [__first, __i) == *__first and *__first < [__i, __last)
                    ++__i;  // __first + 1
                    __j = __last;
                    if (!__comp(*__first, *--__j))  // we need a guard if *__first == *(__last-1)
                    {
                        while (true)
                        {
                            if (__i == __j)
                                return;  // [__first, __last) all equivalent elements
                            if (__comp(*__first, *__i))
                            {
                                swap(*__i, *__j);
                                ++__n_swaps;
                                ++__i;
                                break;
                            }
                            ++__i;
                        }
                    }
                    // [__first, __i) == *__first and *__first < [__j, __last) and __j == __last - 1
                    if (__i == __j)
                        return;
                    while (true)
                    {
                        while (!__comp(*__first, *__i))
                            ++__i;
                        while (__comp(*__first, *--__j))
                            ;
                        if (__i >= __j)
                            break;
                        swap(*__i, *__j);
                        ++__n_swaps;
                        ++__i;
                    }
                    // [__first, __i) == *__first and *__first < [__i, __last)
                    // The first part is sorted,
                    if (__nth < __i)
                        return;
                    // __nth_element the secod part
                    // __nth_element<_Compare>(__i, __nth, __last, __comp);
                    __first = __i;
                    goto __restart;
                }
                if (__comp(*__j, *__m))
                {
                    swap(*__i, *__j);
                    ++__n_swaps;
                    break;  // found guard for downward moving __j, now use unguarded partition
                }
            }
        }
        ++__i;
        // j points beyond range to be tested, *__lm1 is known to be <= *__m
        // if not yet partitioned...
        if (__i < __j)
        {
            // known that *(__i - 1) < *__m
            while (true)
            {
                // __m still guards upward moving __i
                while (__comp(*__i, *__m))
                    ++__i;
                // It is now known that a guard exists for downward moving __j
                while (!__comp(*--__j, *__m))
                    ;
                if (__i >= __j)
                    break;
                swap(*__i, *__j);
                ++__n_swaps;
                // It is known that __m != __j
                // If __m just moved, follow it
                if (__m == __i)
                    __m = __j;
                ++__i;
            }
        }
        // [__first, __i) < *__m and *__m <= [__i, __last)
        if (__i != __m && __comp(*__m, *__i))
        {
            swap(*__i, *__m);
            ++__n_swaps;
        }
        // [__first, __i) < *__i and *__i <= [__i+1, __last)
        if (__nth == __i)
            return;
        if (__n_swaps == 0)
        {
            // We were given a perfectly partitioned sequence.  Coincidence?
            if (__nth < __i)
            {
                // Check for [__first, __i) already sorted
                __j = __m = __first;
                while (++__j != __i)
                {
                    if (__comp(*__j, *__m))
                        // not yet sorted, so sort
                        goto not_sorted;
                    __m = __j;
                }
                // [__first, __i) sorted
                return;
            }
            else
            {
                // Check for [__i, __last) already sorted
                __j = __m = __i;
                while (++__j != __last)
                {
                    if (__comp(*__j, *__m))
                        // not yet sorted, so sort
                        goto not_sorted;
                    __m = __j;
                }
                // [__i, __last) sorted
                return;
            }
        }
not_sorted:
        // __nth_element on range containing __nth
        if (__nth < __i)
        {
            // __nth_element<_Compare>(__first, __nth, __i, __comp);
            __last = __i;
        }
        else
        {
            // __nth_element<_Compare>(__i+1, __nth, __last, __comp);
            __first = ++__i;
        }
    }
}

template <class _RandomAccessIterator, class _Compare>
inline _LIBCUDACXX_INLINE_VISIBILITY
void
nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth, _RandomAccessIterator __last, _Compare __comp)
{
    using _Comp_ref = __comp_ref_type<_Compare>;
    __nth_element<_Comp_ref>(__first, __nth, __last, __comp);
}

template <class _RandomAccessIterator>
inline _LIBCUDACXX_INLINE_VISIBILITY
void
nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth, _RandomAccessIterator __last)
{
    _CUDA_VSTD::nth_element(__first, __nth, __last, __less{});
}

#endif
_LIBCUDACXX_END_NAMESPACE_STD

#ifndef __cuda_std__
_LIBCUDACXX_POP_MACROS
#endif

#if defined(_LIBCUDACXX_HAS_PARALLEL_ALGORITHMS) && _CCCL_STD_VER >= 2017
#   include <__pstl_algorithm>
#endif

#endif  // _LIBCUDACXX_ALGORITHM
