From d84e8f3107d7ab9fb94441f372562955cdf9b963 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei
Date: Fri, 7 Feb 2025 00:44:16 +0900
Subject: [PATCH] gpusort: initial workable GpuSort on top of GpuPreAgg

---
 src/codegen.c       |   8 +-
 src/cuda_gpusort.cu | 543 ++++++++++++++++++++++++++++++++++++++------
 src/gpu_join.c      |   4 +-
 src/gpu_preagg.c    |   4 +-
 src/gpu_service.c   |  67 +++++-
 src/xpu_common.h    |   4 +-
 6 files changed, 536 insertions(+), 94 deletions(-)

diff --git a/src/codegen.c b/src/codegen.c
index 9a3561a0..5db00f52 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -4035,7 +4035,7 @@ codegen_build_gpusort_keydesc(codegen_context *context,
 
 	keydesc->kind = kind;
 	keydesc->nulls_first = ((ival & KSORT_KEY_ATTR__NULLS_FIRST) != 0);
-	keydesc->desc_order = ((ival & KSORT_KEY_ATTR__DESC_ORDER) != 0);
+	keydesc->order_asc = ((ival & KSORT_KEY_ATTR__ORDER_ASC) != 0);
 	if (kind == KSORT_KEY_KIND__VREF)
 	{
 		ListCell   *cell;
@@ -4673,9 +4673,9 @@ __xpucode_sortkeys_cstring(StringInfo buf,
 		else if (desc->src_anum != 0)
 			appendStringInfo(buf, "; key=(out of range)");
 	}
-	appendStringInfo(buf, "; nulls %s; %s order>",
-					 desc->nulls_first ? "first" : "last",
-					 desc->desc_order ? "desc" : "asc");
+	appendStringInfo(buf, "[%s;%s]",
+					 desc->nulls_first ? "NF" : "NL",
+					 desc->order_asc ? "ASC" : "DESC");
 }
 
diff --git a/src/cuda_gpusort.cu b/src/cuda_gpusort.cu
index bee0f4e5..e2a55514 100644
--- a/src/cuda_gpusort.cu
+++ b/src/cuda_gpusort.cu
@@ -12,6 +12,329 @@
 #include "cuda_common.h"
 #include "float2.h"
 
+INLINE_FUNCTION(int)
+__gpusort_comp_rawkey(kern_context *kcxt,
+                      const kern_sortkey_desc *sdesc,
+                      const kern_data_store *kds_final,
+                      const kern_tupitem *titem_x,
+                      const kern_tupitem *titem_y)
+{
+	const void *addr_x = kern_fetch_heaptuple_attr(kcxt, kds_final, titem_x, sdesc->src_anum);
+	const void *addr_y = kern_fetch_heaptuple_attr(kcxt, kds_final, titem_y, sdesc->src_anum);
+
+	if (addr_x && addr_y)
+	{
+		const xpu_datum_operators *key_ops = sdesc->key_ops;
+		xpu_datum_t *datum_x;
+		xpu_datum_t *datum_y;
+		int		sz, comp;
+
+		/*
+		 * !!!workaround for a compiler bug!!!
+		 *
+		 * The pair of identical alloca() calls below was unintentionally
+		 * merged by the compiler; it apparently assumed that alloca()
+		 * returns the same value for an identical argument, so datum_x
+		 * and datum_y ended up pointing to the same buffer.
+		 *
+		 *   datum_x = (xpu_datum_t *)alloca(key_ops->xpu_type_sizeof);
+		 *   datum_y = (xpu_datum_t *)alloca(key_ops->xpu_type_sizeof);
+		 *
+		 * That assumption would be correct if alloca() were an immutable
+		 * function; however, alloca() extends the current stack frame and
+		 * returns a fresh temporary buffer on every call, so datum_x and
+		 * datum_y must be distinct pointers. As a workaround, we call
+		 * alloca() just once and split the buffer manually.
+		 */
+		sz = TYPEALIGN(16, key_ops->xpu_type_sizeof);
+		datum_x = (xpu_datum_t *)alloca(2 * sz);
+		datum_y = (xpu_datum_t *)((char *)datum_x + sz);
+		if (key_ops->xpu_datum_heap_read(kcxt, addr_x, datum_x) &&
+			key_ops->xpu_datum_heap_read(kcxt, addr_y, datum_y) &&
+			key_ops->xpu_datum_comp(kcxt, &comp, datum_x, datum_y))
+			return sdesc->order_asc ? comp : -comp;
+	}
+	else if (addr_x && !addr_y)
+		return (sdesc->nulls_first ? 1 : -1);	/* X is NOT NULL, Y is NULL */
+	else if (!addr_x && addr_y)
+		return (sdesc->nulls_first ? -1 : 1);	/* X is NULL, Y is NOT NULL */
+	return 0;
+}
+
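The single-alloca workaround above is worth remembering whenever two same-sized
scratch buffers are needed. A minimal standalone sketch of the pattern (plain C,
not part of the patch; TYPEALIGN16 is a stand-in for TYPEALIGN(16, ...)):

    #include <alloca.h>
    #include <stddef.h>

    #define TYPEALIGN16(sz)    (((sz) + 15) & ~(size_t)15)

    void fetch_two_datums(size_t type_sizeof)
    {
        size_t  sz = TYPEALIGN16(type_sizeof);
        char   *buf = (char *)alloca(2 * sz);   /* one call, two slots */
        void   *datum_x = buf;
        void   *datum_y = buf + sz;

        /* datum_x != datum_y holds by construction; the optimizer cannot
         * merge the two slots the way it merged two identical alloca()
         * calls. */
        (void)datum_x;
        (void)datum_y;
    }
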
+INLINE_FUNCTION(int)
+__gpusort_comp_pminmax_int64(kern_context *kcxt,
+                             const kern_sortkey_desc *sdesc,
+                             const kern_data_store *kds_final,
+                             const kern_tupitem *titem_x,
+                             const kern_tupitem *titem_y)
+{
+	const kagg_state__pminmax_int64_packed *x = (const kagg_state__pminmax_int64_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_x, sdesc->src_anum);
+	const kagg_state__pminmax_int64_packed *y = (const kagg_state__pminmax_int64_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_y, sdesc->src_anum);
+
+	if (x && (x->attrs & __PAGG_MINMAX_ATTRS__VALID) != 0)
+	{
+		if (y && (y->attrs & __PAGG_MINMAX_ATTRS__VALID) != 0)
+		{
+			if (x->value < y->value)
+				return (sdesc->order_asc ? -1 : 1);
+			if (x->value > y->value)
+				return (sdesc->order_asc ? 1 : -1);
+			return 0;
+		}
+		else
+			return (sdesc->nulls_first ? 1 : -1);	/* X is NOT NULL, Y is NULL */
+	}
+	else if (y && (y->attrs & __PAGG_MINMAX_ATTRS__VALID) != 0)
+		return (sdesc->nulls_first ? -1 : 1);		/* X is NULL, Y is NOT NULL */
+	return 0;
+}
+
+INLINE_FUNCTION(int)
+__gpusort_comp_pminmax_fp64(kern_context *kcxt,
+                            const kern_sortkey_desc *sdesc,
+                            const kern_data_store *kds_final,
+                            const kern_tupitem *titem_x,
+                            const kern_tupitem *titem_y)
+{
+	const kagg_state__pminmax_fp64_packed *x = (const kagg_state__pminmax_fp64_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_x, sdesc->src_anum);
+	const kagg_state__pminmax_fp64_packed *y = (const kagg_state__pminmax_fp64_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_y, sdesc->src_anum);
+
+	if (x && (x->attrs & __PAGG_MINMAX_ATTRS__VALID) != 0)
+	{
+		if (y && (y->attrs & __PAGG_MINMAX_ATTRS__VALID) != 0)
+		{
+			if (x->value < y->value)
+				return (sdesc->order_asc ? -1 : 1);
+			if (x->value > y->value)
+				return (sdesc->order_asc ? 1 : -1);
+			return 0;
+		}
+		else
+			return (sdesc->nulls_first ? 1 : -1);	/* X is NOT NULL, Y is NULL */
+	}
+	else if (y && (y->attrs & __PAGG_MINMAX_ATTRS__VALID) != 0)
+		return (sdesc->nulls_first ? -1 : 1);		/* X is NULL, Y is NOT NULL */
+	return 0;
+}
+
+INLINE_FUNCTION(int)
+__gpusort_comp_psum_int64(kern_context *kcxt,
+                          const kern_sortkey_desc *sdesc,
+                          const kern_data_store *kds_final,
+                          const kern_tupitem *titem_x,
+                          const kern_tupitem *titem_y)
+{
+	const kagg_state__psum_int_packed *x = (const kagg_state__psum_int_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_x, sdesc->src_anum);
+	const kagg_state__psum_int_packed *y = (const kagg_state__psum_int_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_y, sdesc->src_anum);
+
+	if (x && x->nitems > 0)
+	{
+		if (y && y->nitems > 0)
+		{
+			if (x->sum < y->sum)
+				return (sdesc->order_asc ? -1 : 1);
+			if (x->sum > y->sum)
+				return (sdesc->order_asc ? 1 : -1);
+			return 0;
+		}
+		else
+			return (sdesc->nulls_first ? 1 : -1);	/* X!=NULL and Y==NULL */
+	}
+	else if (y && y->nitems > 0)
+		return (sdesc->nulls_first ? -1 : 1);		/* X==NULL and Y!=NULL */
+	return 0;
+}
+
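All of the __gpusort_comp_* helpers share one contract: a negative result sorts
X before Y, a positive result sorts Y before X, and an invalid (NULL) accumulator
is placed according to nulls_first. A host-side sketch of that contract, with
hypothetical names (not part of the patch):

    /* comp < 0: X precedes Y; comp > 0: Y precedes X; comp == 0: tie. */
    static int comp_nullable_int64(long x, int x_isnull,
                                   long y, int y_isnull,
                                   int order_asc, int nulls_first)
    {
        if (!x_isnull && !y_isnull)
        {
            if (x < y) return order_asc ? -1 : 1;
            if (x > y) return order_asc ? 1 : -1;
            return 0;
        }
        if (!x_isnull)              /* only Y is NULL */
            return nulls_first ? 1 : -1;
        if (!y_isnull)              /* only X is NULL */
            return nulls_first ? -1 : 1;
        return 0;
    }

Note that nulls_first is applied independently of order_asc, which matches
PostgreSQL's NULLS FIRST/LAST semantics.
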
+INLINE_FUNCTION(int)
+__gpusort_comp_psum_fp64(kern_context *kcxt,
+                         const kern_sortkey_desc *sdesc,
+                         const kern_data_store *kds_final,
+                         const kern_tupitem *titem_x,
+                         const kern_tupitem *titem_y)
+{
+	const kagg_state__psum_fp_packed *x = (const kagg_state__psum_fp_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_x, sdesc->src_anum);
+	const kagg_state__psum_fp_packed *y = (const kagg_state__psum_fp_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_y, sdesc->src_anum);
+
+	if (x && x->nitems > 0)
+	{
+		if (y && y->nitems > 0)
+		{
+			if (x->sum < y->sum)
+				return (sdesc->order_asc ? -1 : 1);
+			if (x->sum > y->sum)
+				return (sdesc->order_asc ? 1 : -1);
+			return 0;
+		}
+		else
+			return (sdesc->nulls_first ? 1 : -1);	/* X!=NULL and Y==NULL */
+	}
+	else if (y && y->nitems > 0)
+		return (sdesc->nulls_first ? -1 : 1);		/* X==NULL and Y!=NULL */
+	return 0;
+}
+
+INLINE_FUNCTION(int)
+__gpusort_comp_psum_numeric(kern_context *kcxt,
+                            const kern_sortkey_desc *sdesc,
+                            const kern_data_store *kds_final,
+                            const kern_tupitem *titem_x,
+                            const kern_tupitem *titem_y)
+{
+	const kagg_state__psum_numeric_packed *x = (const kagg_state__psum_numeric_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_x, sdesc->src_anum);
+	const kagg_state__psum_numeric_packed *y = (const kagg_state__psum_numeric_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_y, sdesc->src_anum);
+
+	if (x && x->nitems > 0)
+	{
+		if (y && y->nitems > 0)
+		{
+			xpu_numeric_t x_datum;
+			xpu_numeric_t y_datum;
+			int		xspecial = (x->attrs & __PAGG_NUMERIC_ATTRS__MASK);
+			int		yspecial = (y->attrs & __PAGG_NUMERIC_ATTRS__MASK);
+			int		comp = 0;
+
+			if (xspecial == 0)
+			{
+				x_datum.kind = XPU_NUMERIC_KIND__VALID;
+				x_datum.weight = (int16_t)(x->attrs & __PAGG_NUMERIC_ATTRS__WEIGHT);
+				x_datum.u.value = __fetch_int128_packed(&x->sum);
+			}
+			else if (xspecial == __PAGG_NUMERIC_ATTRS__PINF)
+				x_datum.kind = XPU_NUMERIC_KIND__POS_INF;
+			else if (xspecial == __PAGG_NUMERIC_ATTRS__NINF)
+				x_datum.kind = XPU_NUMERIC_KIND__NEG_INF;
+			else
+				x_datum.kind = XPU_NUMERIC_KIND__NAN;
+			x_datum.expr_ops = &xpu_numeric_ops;
+
+			if (yspecial == 0)
+			{
+				y_datum.kind = XPU_NUMERIC_KIND__VALID;
+				y_datum.weight = (int16_t)(y->attrs & __PAGG_NUMERIC_ATTRS__WEIGHT);
+				y_datum.u.value = __fetch_int128_packed(&y->sum);
+			}
+			else if (yspecial == __PAGG_NUMERIC_ATTRS__PINF)
+				y_datum.kind = XPU_NUMERIC_KIND__POS_INF;
+			else if (yspecial == __PAGG_NUMERIC_ATTRS__NINF)
+				y_datum.kind = XPU_NUMERIC_KIND__NEG_INF;
+			else
+				y_datum.kind = XPU_NUMERIC_KIND__NAN;
+			y_datum.expr_ops = &xpu_numeric_ops;
+
+			sdesc->key_ops->xpu_datum_comp(kcxt,
+										   &comp,
+										   (xpu_datum_t *)&x_datum,
+										   (xpu_datum_t *)&y_datum);
+			return (sdesc->order_asc ? comp : -comp);
+		}
+		else
+			return (sdesc->nulls_first ? 1 : -1);	/* X!=NULL, Y==NULL */
+	}
+	else if (y && y->nitems > 0)
+		return (sdesc->nulls_first ? -1 : 1);		/* X==NULL, Y!=NULL */
+	return 0;
+}
+
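For intuition, the core of comparing two finite packed numeric sums is a
weight-reconciled int128 comparison. A simplified sketch below, assuming a
value * 10^(-weight) encoding (the real xpu_datum_comp() also handles the
NaN/+Inf/-Inf kinds and avoids the overflow this naive rescaling permits);
the helper name is hypothetical:

    static int numeric_comp_sketch(__int128 v1, int w1,
                                   __int128 v2, int w2)
    {
        /* rescale the smaller-weight value up until weights match;
         * v * 10^(-w) is invariant under (v *= 10, w += 1) */
        while (w1 < w2) { v1 *= 10; w1++; }
        while (w2 < w1) { v2 *= 10; w2++; }
        return (v1 < v2) ? -1 : (v1 > v2) ? 1 : 0;
    }
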
+INLINE_FUNCTION(int)
+__gpusort_comp_precomp_fp64(const kern_sortkey_desc *sdesc,
+                            const kern_tupitem *titem_x,
+                            const kern_tupitem *titem_y)
+{
+	const char *addr_x = ((char *)&titem_x->htup + titem_x->t_len + sdesc->buf_offset);
+	const char *addr_y = ((char *)&titem_y->htup + titem_y->t_len + sdesc->buf_offset);
+	bool		notnull_x = *addr_x++;
+	bool		notnull_y = *addr_y++;
+	float8_t	fval_x;
+	float8_t	fval_y;
+
+	if (notnull_x && notnull_y)
+	{
+		memcpy(&fval_x, addr_x, sizeof(float8_t));
+		memcpy(&fval_y, addr_y, sizeof(float8_t));
+		if (fval_x < fval_y)
+			return (sdesc->order_asc ? -1 : 1);
+		if (fval_x > fval_y)
+			return (sdesc->order_asc ? 1 : -1);
+	}
+	else if (notnull_x && !notnull_y)
+		return (sdesc->nulls_first ? 1 : -1);
+	else if (!notnull_x && notnull_y)
+		return (sdesc->nulls_first ? -1 : 1);
+	return 0;
+}
+
+INLINE_FUNCTION(int)
+__gpusort_comp_keys(kern_context *kcxt,
+                    const kern_expression *sort_kexp,
+                    const kern_data_store *kds_final,
+                    const kern_tupitem *titem_x,
+                    const kern_tupitem *titem_y)
+{
+	if (!titem_x)
+		return (!titem_y ? 0 : 1);
+	else if (!titem_y)
+		return -1;
+
+	for (int k=0; k < sort_kexp->u.sort.nkeys; k++)
+	{
+		const kern_sortkey_desc *sdesc = &sort_kexp->u.sort.desc[k];
+		int		comp;
+
+		switch (sdesc->kind)
+		{
+		case KSORT_KEY_KIND__VREF:
+			comp = __gpusort_comp_rawkey(kcxt, sdesc, kds_final, titem_x, titem_y);
+			break;
+		case KSORT_KEY_KIND__PMINMAX_INT64:
+			comp = __gpusort_comp_pminmax_int64(kcxt, sdesc, kds_final, titem_x, titem_y);
+			break;
+		case KSORT_KEY_KIND__PMINMAX_FP64:
+			comp = __gpusort_comp_pminmax_fp64(kcxt, sdesc, kds_final, titem_x, titem_y);
+			break;
+		case KSORT_KEY_KIND__PSUM_INT64:
+			comp = __gpusort_comp_psum_int64(kcxt, sdesc, kds_final, titem_x, titem_y);
+			break;
+		case KSORT_KEY_KIND__PSUM_FP64:
+			comp = __gpusort_comp_psum_fp64(kcxt, sdesc, kds_final, titem_x, titem_y);
+			break;
+		case KSORT_KEY_KIND__PSUM_NUMERIC:
+			comp = __gpusort_comp_psum_numeric(kcxt, sdesc, kds_final, titem_x, titem_y);
+			break;
+		case KSORT_KEY_KIND__PAVG_INT64:
+		case KSORT_KEY_KIND__PAVG_FP64:
+		case KSORT_KEY_KIND__PAVG_NUMERIC:
+		case KSORT_KEY_KIND__PVARIANCE_SAMP:
+		case KSORT_KEY_KIND__PVARIANCE_POP:
+		case KSORT_KEY_KIND__PCOVAR_CORR:
+		case KSORT_KEY_KIND__PCOVAR_SAMP:
+		case KSORT_KEY_KIND__PCOVAR_POP:
+		case KSORT_KEY_KIND__PCOVAR_AVGX:
+		case KSORT_KEY_KIND__PCOVAR_AVGY:
+		case KSORT_KEY_KIND__PCOVAR_COUNT:
+		case KSORT_KEY_KIND__PCOVAR_INTERCEPT:
+		case KSORT_KEY_KIND__PCOVAR_REGR_R2:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SLOPE:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SXX:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SXY:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SYY:
+			/* pre-computed float8 values */
+			comp = __gpusort_comp_precomp_fp64(sdesc, titem_x, titem_y);
+			break;
+		default:
+			/* Bug? should not happen */
+			comp = 0;
+			break;
+		}
+		if (comp != 0)
+			return comp;
+	}
+	return 0;
+}
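
__gpusort_comp_precomp_fp64() assumes each tuple carries an out-of-band key
buffer immediately after its t_len bytes: a one-byte not-null flag followed by
a float8 at sdesc->buf_offset. A host-side sketch of one slot's layout
(hypothetical helper names; slot_base corresponds to
(char *)&titem->htup + titem->t_len):

    #include <string.h>

    /* per-key slot layout: [1-byte notnull flag][8-byte float8 value] */
    static void precomp_write(char *slot_base, int buf_offset,
                              int notnull, double value)
    {
        char *dest = slot_base + buf_offset;

        *dest++ = (char)notnull;
        if (notnull)
            memcpy(dest, &value, sizeof(double));
    }

    static int precomp_read(const char *slot_base, int buf_offset,
                            double *p_value)
    {
        const char *addr = slot_base + buf_offset;

        if (!*addr++)
            return 0;       /* NULL key */
        memcpy(p_value, addr, sizeof(double));
        return 1;
    }

memcpy() is used instead of direct float8 loads because the slot is only
byte-aligned (the flag byte shifts the value by one).
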
 
 /*
  * kern_gpusort_exec_bitonic
  */
@@ -20,16 +343,64 @@ KERNEL_FUNCTION(void)
 kern_gpusort_exec_bitonic(kern_session_info *session,
                           kern_gputask *kgtask,
                           kern_data_store *kds_final,
-                          int step)
+                          uint32_t nr_threads,
+                          uint64_t *row_index,
+                          int scale, int step)
 {
+	const kern_expression *sort_kexp = SESSION_KEXP_GPUSORT_KEYDESC(session);
+	const char *end = (const char *)kds_final + kds_final->length;
+	kern_context *kcxt;
+	uint32_t	thread_id;
+
+	/* sanity checks */
+	assert(get_local_size() <= CUDA_MAXTHREADS_PER_BLOCK);
+	assert((nr_threads & (nr_threads-1)) == 0);
+	/* save the GPU-Task specific read-only properties */
+	if (get_local_id() == 0)
+	{
+		stromTaskProp__cuda_dindex        = kgtask->cuda_dindex;
+		stromTaskProp__cuda_stack_limit   = kgtask->cuda_stack_limit;
+		stromTaskProp__partition_divisor  = kgtask->partition_divisor;
+		stromTaskProp__partition_reminder = kgtask->partition_reminder;
+	}
+	/* setup execution context */
+	INIT_KERNEL_CONTEXT(kcxt, session, NULL);
+	if (!row_index)
+		row_index = KDS_GET_ROWINDEX(kds_final);
+	for (thread_id = get_global_id();
+		 thread_id < nr_threads;
+		 thread_id += get_global_size())
+	{
+		uint32_t	base = ((thread_id >> scale) << (scale+1));
+		uint32_t	m_bits = (thread_id & ((1U<<scale)-1)) >> step;
+		uint32_t	l_bits = (thread_id & ((1U<<step)-1));
+		uint32_t	index = base + (m_bits << (step+1)) + l_bits;
+		uint32_t	buddy = index + (1U<<step);
+		const kern_tupitem *titem_x = (row_index[index] == 0 ? NULL
+									   : (const kern_tupitem *)(end - row_index[index]));
+		const kern_tupitem *titem_y = (row_index[buddy] == 0 ? NULL
+									   : (const kern_tupitem *)(end - row_index[buddy]));
+		int			comp;
+
+		comp = __gpusort_comp_keys(kcxt, sort_kexp, kds_final, titem_x, titem_y);
+		if ((thread_id & (1U<<scale)) != 0)
+			comp = -comp;	/* this comparator belongs to a descending block */
+		if (comp > 0)
+		{
+			uint64_t	temp = row_index[index];
+
+			row_index[index] = row_index[buddy];
+			row_index[buddy] = temp;
+		}
+	}
 }
 
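The base/m_bits/l_bits arithmetic maps thread_id to one compare-exchange pair
of a bitonic network over 2*nr_threads slots. A host-side sketch that runs the
same (scale, step) schedule over a small array to show the mapping sorts
correctly (standalone C, power-of-two element count assumed):

    /* Sort n = 2^log2n ints ascending; nr_threads = n/2 comparators. */
    static void bitonic_sort(int *v, unsigned log2n)
    {
        unsigned nr_threads = 1U << (log2n - 1);

        for (unsigned scale = 0; scale < log2n; scale++)
        {
            for (int step = (int)scale; step >= 0; step--)
            {
                for (unsigned tid = 0; tid < nr_threads; tid++)
                {
                    unsigned base   = (tid >> scale) << (scale + 1);
                    unsigned m_bits = (tid & ((1U << scale) - 1)) >> step;
                    unsigned l_bits = (tid & ((1U << step) - 1));
                    unsigned index  = base + (m_bits << (step + 1)) + l_bits;
                    unsigned buddy  = index + (1U << step);
                    int      desc   = (tid & (1U << scale)) != 0;
                    int      comp   = (v[index] > v[buddy]) - (v[index] < v[buddy]);

                    if ((desc ? -comp : comp) > 0)
                    {
                        int t = v[index]; v[index] = v[buddy]; v[buddy] = t;
                    }
                }
            }
        }
    }

For example, bitonic_sort(v, 3) sorts an 8-element array. In the final stage
(scale == log2n-1) the direction bit is always zero because tid < nr_threads,
so the last merge is globally ascending.
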
-STATIC_FUNCTION(void)
-__gpusort_finalize_pavg_int64(kern_context *kcxt,
-                              kern_data_store *kds_final,
-                              kern_tupitem *titem,
-                              const kern_sortkey_desc *sdesc)
+INLINE_FUNCTION(void)
+__gpusort_prep_pavg_int64(kern_context *kcxt,
+                          const kern_data_store *kds_final,
+                          const kern_tupitem *titem,
+                          const kern_sortkey_desc *sdesc)
 {
 	const void *addr;
 	char	   *dest;
@@ -55,11 +426,11 @@ __gpusort_finalize_pavg_int64(kern_context *kcxt,
 	}
 }
 
-STATIC_FUNCTION(void)
-__gpusort_finalize_pavg_fp64(kern_context *kcxt,
-                             kern_data_store *kds_final,
-                             kern_tupitem *titem,
-                             const kern_sortkey_desc *sdesc)
+INLINE_FUNCTION(void)
+__gpusort_prep_pavg_fp64(kern_context *kcxt,
+                         const kern_data_store *kds_final,
+                         const kern_tupitem *titem,
+                         const kern_sortkey_desc *sdesc)
 {
 	const void *addr;
 	char	   *dest;
@@ -85,11 +456,11 @@ __gpusort_finalize_pavg_fp64(kern_context *kcxt,
 	}
 }
 
-STATIC_FUNCTION(void)
-__gpusort_finalize_pavg_numeric(kern_context *kcxt,
-                                kern_data_store *kds_final,
-                                kern_tupitem *titem,
-                                const kern_sortkey_desc *sdesc)
+INLINE_FUNCTION(void)
+__gpusort_prep_pavg_numeric(kern_context *kcxt,
+                            const kern_data_store *kds_final,
+                            const kern_tupitem *titem,
+                            const kern_sortkey_desc *sdesc)
 {
 	const void *addr;
 	char	   *dest;
@@ -162,18 +534,17 @@ __gpusort_finalize_pavg_numeric(kern_context *kcxt,
 				fval = -INFINITY;
 			else
 				fval = NAN;
-			printf("fval = %f\n", fval);
 			memcpy(dest, &fval, sizeof(float8_t));
 		}
 	}
 }
-
-STATIC_FUNCTION(void)
-__gpusort_finalize_pvariance(kern_context *kcxt,
-                             kern_data_store *kds_final,
-                             kern_tupitem *titem,
-                             const kern_sortkey_desc *sdesc)
+
+INLINE_FUNCTION(void)
+__gpusort_prep_pvariance(kern_context *kcxt,
+                         const kern_data_store *kds_final,
+                         const kern_tupitem *titem,
+                         const kern_sortkey_desc *sdesc)
 {
 	const void *addr;
 	char	   *dest;
@@ -228,11 +599,11 @@ __gpusort_finalize_pvariance(kern_context *kcxt,
 	}
 }
 
-STATIC_FUNCTION(void)
-__gpusort_finalize_pcovariance(kern_context *kcxt,
-                               kern_data_store *kds_final,
-                               kern_tupitem *titem,
-                               const kern_sortkey_desc *sdesc)
+INLINE_FUNCTION(void)
+__gpusort_prep_pcovariance(kern_context *kcxt,
+                           const kern_data_store *kds_final,
+                           const kern_tupitem *titem,
+                           const kern_sortkey_desc *sdesc)
 {
 	const void *addr;
 	char	   *dest;
@@ -344,20 +715,75 @@
 }
 
 /*
- * kern_gpusort_finalize_buffer
+ * per-tuple preparation on demand
+ */
+INLINE_FUNCTION(void)
+__gpusort_prep_tupitem(kern_context *kcxt,
+                       const kern_expression *sort_kexp,
+                       const kern_data_store *kds_final,
+                       uint32_t kds_index)
+{
+	const kern_tupitem *titem = KDS_GET_TUPITEM(kds_final, kds_index);
+
+	for (int k=0; k < sort_kexp->u.sort.nkeys; k++)
+	{
+		const kern_sortkey_desc *sdesc = &sort_kexp->u.sort.desc[k];
+
+		switch (sdesc->kind)
+		{
+		case KSORT_KEY_KIND__PAVG_INT64:
+			__gpusort_prep_pavg_int64(kcxt, kds_final, titem, sdesc);
+			break;
+		case KSORT_KEY_KIND__PAVG_FP64:
+			__gpusort_prep_pavg_fp64(kcxt, kds_final, titem, sdesc);
+			break;
+		case KSORT_KEY_KIND__PAVG_NUMERIC:
+			__gpusort_prep_pavg_numeric(kcxt, kds_final, titem, sdesc);
+			break;
+		case KSORT_KEY_KIND__PVARIANCE_SAMP:
+		case KSORT_KEY_KIND__PVARIANCE_POP:
+			__gpusort_prep_pvariance(kcxt, kds_final, titem, sdesc);
+			break;
+		case KSORT_KEY_KIND__PCOVAR_CORR:
+		case KSORT_KEY_KIND__PCOVAR_SAMP:
+		case KSORT_KEY_KIND__PCOVAR_POP:
+		case KSORT_KEY_KIND__PCOVAR_AVGX:
+		case KSORT_KEY_KIND__PCOVAR_AVGY:
+		case KSORT_KEY_KIND__PCOVAR_COUNT:
+		case KSORT_KEY_KIND__PCOVAR_INTERCEPT:
+		case KSORT_KEY_KIND__PCOVAR_REGR_R2:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SLOPE:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SXX:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SXY:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SYY:
+			__gpusort_prep_pcovariance(kcxt, kds_final, titem, sdesc);
+			break;
+		default:
+			/* nothing to do */
+			break;
+		}
+	}
+}
+
+/*
+ * kern_gpusort_prep_buffer
  */
 KERNEL_FUNCTION(void)
-kern_gpusort_finalize_buffer(kern_session_info *session,
-                             kern_gputask *kgtask,
-                             kern_data_store *kds_final)
+kern_gpusort_prep_buffer(kern_session_info *session,
+                         kern_gputask *kgtask,
+                         kern_data_store *kds_final,
+                         uint32_t nr_threads,
+                         uint64_t *row_index)
 {
 	const kern_expression *sort_kexp = SESSION_KEXP_GPUSORT_KEYDESC(session);
 	kern_context *kcxt;
+	uint32_t	nrooms = 2 * nr_threads;
 	uint32_t	index;
 
 	/* sanity checks */
 	assert(get_local_size() <= CUDA_MAXTHREADS_PER_BLOCK);
-	assert(sort_kexp->u.sort.needs_finalization);
+	assert(kds_final->nitems >= nr_threads &&
+		   kds_final->nitems <= nrooms);
 	/* save the GPU-Task specific read-only properties */
 	if (get_local_id() == 0)
 	{
@@ -368,49 +794,18 @@ kern_gpusort_finalize_buffer(kern_session_info *session,
 	}
 	/* setup execution context */
 	INIT_KERNEL_CONTEXT(kcxt, session, NULL);
-	for (index=get_global_id();
-		 index < kds_final->nitems;
-		 index += get_global_size())
+	for (index=get_global_id(); index < nrooms; index += get_global_size())
 	{
-		kern_tupitem *titem = KDS_GET_TUPITEM(kds_final, index);
-
-		for (int k=0; k < sort_kexp->u.sort.nkeys; k++)
+		if (index < kds_final->nitems)
 		{
-			const kern_sortkey_desc *sdesc = &sort_kexp->u.sort.desc[k];
-
-			switch (sdesc->kind)
-			{
-			case KSORT_KEY_KIND__PAVG_INT64:
-				__gpusort_finalize_pavg_int64(kcxt, kds_final, titem, sdesc);
-				break;
-			case KSORT_KEY_KIND__PAVG_FP64:
-				__gpusort_finalize_pavg_fp64(kcxt, kds_final, titem, sdesc);
-				break;
-			case KSORT_KEY_KIND__PAVG_NUMERIC:
-				__gpusort_finalize_pavg_numeric(kcxt, kds_final, titem, sdesc);
-				break;
-			case KSORT_KEY_KIND__PVARIANCE_SAMP:
-			case KSORT_KEY_KIND__PVARIANCE_POP:
-				__gpusort_finalize_pvariance(kcxt, kds_final, titem, sdesc);
-				break;
-			case KSORT_KEY_KIND__PCOVAR_CORR:
-			case KSORT_KEY_KIND__PCOVAR_SAMP:
-			case KSORT_KEY_KIND__PCOVAR_POP:
-			case KSORT_KEY_KIND__PCOVAR_AVGX:
-			case KSORT_KEY_KIND__PCOVAR_AVGY:
-			case KSORT_KEY_KIND__PCOVAR_COUNT:
-			case KSORT_KEY_KIND__PCOVAR_INTERCEPT:
-			case KSORT_KEY_KIND__PCOVAR_REGR_R2:
-			case KSORT_KEY_KIND__PCOVAR_REGR_SLOPE:
-			case KSORT_KEY_KIND__PCOVAR_REGR_SXX:
-			case KSORT_KEY_KIND__PCOVAR_REGR_SXY:
-			case KSORT_KEY_KIND__PCOVAR_REGR_SYY:
-				__gpusort_finalize_pcovariance(kcxt, kds_final, titem, sdesc);
-				break;
-			default:
-				/* nothing to do */
-				break;
-			}
+			if (sort_kexp->u.sort.needs_finalization)
+				__gpusort_prep_tupitem(kcxt, sort_kexp, kds_final, index);
+			if (row_index)
+				row_index[index] = KDS_GET_ROWINDEX(kds_final)[index];
 		}
+		else if (row_index)
+			row_index[index] = 0;		/* no tuple */
+		else
+			KDS_GET_ROWINDEX(kds_final)[index] = 0;	/* no tuple */
 	}
 }
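
kern_gpusort_prep_buffer pads the row-index out to the next power of two with
zero entries (a KDS row offset of zero means "no tuple", since real offsets are
measured back from the buffer tail), so the bitonic network always sees exactly
2*nr_threads slots and the comparator sorts the empty slots past every real
tuple. A host-side sketch of that padding step (hypothetical helper name):

    #include <stdint.h>
    #include <string.h>

    /* copy nitems live offsets, then zero-fill up to nrooms = 2^k slots */
    static void pad_row_index(uint64_t *dst, const uint64_t *src,
                              uint32_t nitems, uint32_t nrooms)
    {
        memcpy(dst, src, sizeof(uint64_t) * nitems);
        memset(dst + nitems, 0, sizeof(uint64_t) * (nrooms - nitems));
    }
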
diff --git a/src/gpu_join.c b/src/gpu_join.c
index fbd1ff23..b5c0f31f 100644
--- a/src/gpu_join.c
+++ b/src/gpu_join.c
@@ -960,8 +960,8 @@ try_add_sorted_gpujoin_path(PlannerInfo *root,
 
 		if (pk->pk_nulls_first)
 			kind |= KSORT_KEY_ATTR__NULLS_FIRST;
-		if (pk->pk_strategy == BTGreaterStrategyNumber)
-			kind |= KSORT_KEY_ATTR__DESC_ORDER;
+		if (pk->pk_strategy == BTLessStrategyNumber)
+			kind |= KSORT_KEY_ATTR__ORDER_ASC;
-		else if (pk->pk_strategy != BTLessStrategyNumber)
+		else if (pk->pk_strategy != BTGreaterStrategyNumber)
 			return;		/* should not happen */
 		sortkeys_expr = lappend(sortkeys_expr, f_expr);
diff --git a/src/gpu_preagg.c b/src/gpu_preagg.c
index deb15856..d8e3923a 100644
--- a/src/gpu_preagg.c
+++ b/src/gpu_preagg.c
@@ -2031,8 +2031,8 @@ consider_sorted_groupby_path(PlannerInfo *root,
 		}
 		if (pk->pk_nulls_first)
 			kind |= KSORT_KEY_ATTR__NULLS_FIRST;
-		if (pk->pk_strategy == BTGreaterStrategyNumber)
-			kind |= KSORT_KEY_ATTR__DESC_ORDER;
+		if (pk->pk_strategy == BTLessStrategyNumber)
+			kind |= KSORT_KEY_ATTR__ORDER_ASC;
-		else if (pk->pk_strategy != BTLessStrategyNumber)
+		else if (pk->pk_strategy != BTGreaterStrategyNumber)
 			return false;	/* should not happen */
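Both planner hooks now translate a PathKey into the packed sort-key flags the
same way. Note the fallback branch must test against BTGreaterStrategyNumber
once the flag's polarity flips to ORDER_ASC, otherwise every DESC key would
bail out. A condensed sketch of the mapping (hypothetical helper; the b-tree
strategy numbers are passed in to keep the sketch self-contained):

    #include <stdbool.h>
    #include <stdint.h>

    #define KSORT_KEY_ATTR__NULLS_FIRST  0x0400U
    #define KSORT_KEY_ATTR__ORDER_ASC    0x8000U

    /* returns false for unsupported b-tree strategies */
    static bool pathkey_to_ksort_flags(int pk_strategy, bool pk_nulls_first,
                                       int bt_less, int bt_greater,
                                       uint32_t *p_kind)
    {
        uint32_t kind = *p_kind;

        if (pk_nulls_first)
            kind |= KSORT_KEY_ATTR__NULLS_FIRST;
        if (pk_strategy == bt_less)
            kind |= KSORT_KEY_ATTR__ORDER_ASC;   /* ASC */
        else if (pk_strategy != bt_greater)
            return false;                        /* should not happen */
        *p_kind = kind;
        return true;
    }
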
diff --git a/src/gpu_service.c b/src/gpu_service.c
index 84ac5cfc..2f74219e 100644
--- a/src/gpu_service.c
+++ b/src/gpu_service.c
@@ -42,7 +42,7 @@ struct gpuContext
 	CUfunction	cufn_prep_gistindex;
 	CUfunction	cufn_merge_outer_join_map;
 	CUfunction	cufn_merge_gpupreagg_buffer;
-	CUfunction	cufn_gpusort_finalize_buffer;
+	CUfunction	cufn_gpusort_prep_buffer;
 	CUfunction	cufn_gpusort_exec_bitonic;
 	CUfunction	cufn_kbuf_partitioning;
 	CUfunction	cufn_kbuf_reconstruction;
@@ -4325,15 +4325,20 @@ gpuservSortingFinalBuffer(gpuClient *gclient,
 	kern_session_info *session = gclient->h_session;
 	kern_expression *kexp_gpusort = SESSION_KEXP_GPUSORT_KEYDESC(session);
 	gpuContext *old_gcontext = NULL;
+	uint32_t	nr_threads;
+	int			max_scale;
+	int			scale, step;
 	int			grid_sz, block_sz;
+	size_t		required;
 	gpuMemChunk *t_chunk = NULL;
 	kern_gputask *kgtask;
 	CUdeviceptr	m_session;
-	void	   *kern_args[4];
+	CUdeviceptr	m_rowindex = 0UL;
+	void	   *kern_args[10];
 	CUresult	rc;
 	bool		retval = false;
 
-	if (!kexp_gpusort || !kexp_gpusort->u.sort.needs_finalization)
+	if (!kexp_gpusort || kds_final->nitems <= 1)
 		return true;	/* nothing to do */
 	if (gcontext != GpuWorkerCurrentContext)
 		old_gcontext = gpuContextSwitchTo(gcontext);
 
 	rc = gpuOptimalBlockSize(&grid_sz,
 							 &block_sz,
-							 gcontext->cufn_gpusort_finalize_buffer, 0);
+							 gcontext->cufn_gpusort_prep_buffer, 0);
 	if (rc != CUDA_SUCCESS)
 	{
 		gpuClientELog(gclient, "failed on gpuOptimalBlockSize: %s",
 					  cuStrError(rc));
 		goto bailout;
 	}
-	grid_sz = Min(grid_sz, (kds_final->nitems + block_sz - 1) / block_sz);
+	max_scale = get_next_log2(kds_final->nitems) - 1;
+	nr_threads = (1UL << max_scale);
+	grid_sz = Min(grid_sz, (nr_threads + block_sz - 1) / block_sz);
 
 	/*
 	 * Setup the control structure
 	 */
-	t_chunk = gpuMemAllocManaged(sizeof(kern_gputask));
+	required = MAXALIGN(sizeof(kern_gputask));
+	if (!__KDS_CHECK_OVERFLOW(kds_final, 2 * nr_threads, kds_final->usage))
+	{
+		/* kds_final cannot host the expanded row-index; keep it separated */
+		required += sizeof(uint64_t) * 2 * nr_threads;
+		m_rowindex = -1L;	/* assigned right after the allocation */
+	}
+	t_chunk = gpuMemAllocManaged(required);
 	if (!t_chunk)
 	{
 		gpuClientFatal(gclient, "failed on gpuMemAllocManaged: %lu",
@@ -4366,13 +4380,19 @@ gpuservSortingFinalBuffer(gpuClient *gclient,
 	kgtask->block_sz = block_sz;
 	kgtask->cuda_dindex = MY_DINDEX_PER_THREAD;
 	kgtask->cuda_stack_limit = GpuWorkerCurrentContext->cuda_stack_limit;
+	if (m_rowindex != 0UL)
+		m_rowindex = t_chunk->m_devptr + MAXALIGN(sizeof(kern_gputask));
 
 	m_session = gclient->__session[MY_DINDEX_PER_THREAD]->m_devptr;
 	kern_args[0] = &m_session;
 	kern_args[1] = &kgtask;
 	kern_args[2] = &kds_final;
+	kern_args[3] = &nr_threads;
+	kern_args[4] = &m_rowindex;
+	kern_args[5] = &scale;
+	kern_args[6] = &step;
 
-	rc = cuLaunchKernel(gcontext->cufn_gpusort_finalize_buffer,
+	rc = cuLaunchKernel(gcontext->cufn_gpusort_prep_buffer,
 						grid_sz, 1, 1,
 						block_sz, 1, 1,
 						0,
@@ -4385,6 +4405,27 @@ gpuservSortingFinalBuffer(gpuClient *gclient,
 					  grid_sz, block_sz, cuStrError(rc));
 		goto bailout;
 	}
+	/* runs bitonic sorting */
+	for (scale=0; scale <= max_scale; scale++)
+	{
+		for (step=scale; step >= 0; step--)
+		{
+			rc = cuLaunchKernel(gcontext->cufn_gpusort_exec_bitonic,
+								grid_sz, 1, 1,
+								block_sz, 1, 1,
+								0,
+								MY_STREAM_PER_THREAD,
+								kern_args,
+								NULL);
+			if (rc != CUDA_SUCCESS)
+			{
+				gpuClientELog(gclient, "failed on cuLaunchKernel(grid_sz=%d, block_sz=%d): %s",
+							  grid_sz, block_sz, cuStrError(rc));
+				goto bailout;
+			}
+		}
+	}
+	/* wait for completion */
 	rc = cuStreamSynchronize(MY_STREAM_PER_THREAD);
 	if (rc != CUDA_SUCCESS)
 	{
@@ -4392,7 +4433,13 @@ gpuservSortingFinalBuffer(gpuClient *gclient,
 					  cuStrError(rc));
 		goto bailout;
 	}
-	//TODO: Expand kds_final_dst if overflow
+	/* write back the row-index if separated */
+	if (m_rowindex != 0UL)
+	{
+		memcpy(KDS_GET_ROWINDEX(kds_final),
+			   (void *)m_rowindex,
+			   sizeof(uint64_t) * kds_final->nitems);
+	}
 	retval = true;
 bailout:
 	if (old_gcontext)
@@ -4967,12 +5014,12 @@ __setupGpuKernelFunctionsAndParams(gpuContext *gcontext)
 			 func_name, cuStrError(rc));
 	gcontext->cufn_merge_gpupreagg_buffer = cuda_function;
-	/* ------ kern_gpusort_finalize_buffer ------ */
-	func_name = "kern_gpusort_finalize_buffer";
+	/* ------ kern_gpusort_prep_buffer ------ */
+	func_name = "kern_gpusort_prep_buffer";
 	rc = cuModuleGetFunction(&cuda_function, cuda_module, func_name);
 	if (rc != CUDA_SUCCESS)
 		elog(ERROR, "failed on cuModuleGetFunction('%s'): %s",
 			 func_name, cuStrError(rc));
-	gcontext->cufn_gpusort_finalize_buffer = cuda_function;
+	gcontext->cufn_gpusort_prep_buffer = cuda_function;
 	/* ------ kern_gpusort_exec_bitonic ------ */
 	func_name = "kern_gpusort_exec_bitonic";
 	rc = cuModuleGetFunction(&cuda_function, cuda_module, func_name);
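The double loop launches the bitonic kernel once per (scale, step) pair, which
acts as a grid-wide barrier between passes, for a total of
(max_scale+1)*(max_scale+2)/2 launches. A tiny host sketch that prints the
schedule for a given row count (next_log2() here approximates get_next_log2()
as the smallest k with 2^k >= nitems):

    #include <stdio.h>

    static int next_log2(unsigned nitems)
    {
        int k = 0;
        while ((1U << k) < nitems)
            k++;
        return k;
    }

    int main(void)
    {
        unsigned nitems = 1000000;
        int max_scale = next_log2(nitems) - 1;   /* 19 for one million rows */
        int launches = 0;

        for (int scale = 0; scale <= max_scale; scale++)
            for (int step = scale; step >= 0; step--)
            {
                printf("launch: scale=%d step=%d\n", scale, step);
                launches++;
            }
        printf("total launches = %d\n", launches);   /* 20*21/2 = 210 */
        return 0;
    }
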
diff --git a/src/xpu_common.h b/src/xpu_common.h
index 06f4813e..92a3425d 100644
--- a/src/xpu_common.h
+++ b/src/xpu_common.h
@@ -2285,7 +2285,7 @@ typedef struct
 
 #define KSORT_KEY_ATTR__NULLS_FIRST		0x0400U
-#define KSORT_KEY_ATTR__DESC_ORDER		0x8000U
+#define KSORT_KEY_ATTR__ORDER_ASC		0x8000U
 #define KSORT_KEY_KIND__MASK			0x03ffU
 #define KSORT_KEY_KIND__SHIFT			16
 #define KSORT_KEY_KIND__VREF			0
@@ -2317,7 +2317,7 @@ typedef struct
 {
 	uint16_t	kind;			/* any of KSORT_KEY_KIND__* */
 	int8_t		nulls_first;	/* true, if NULLs first */
-	int8_t		desc_order;		/* true, if smaller is first */
+	int8_t		order_asc;		/* true, if ORDER ASC */
 	uint16_t	src_anum;		/* source attribute number of KDS */
 	uint16_t	buf_offset;		/* if not KSORT_KEY_KIND__VREF, it means offset of
 								 * the temporary calculated sorting key.
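
For reference, the masks above pack the key kind and its attribute flags into
one 32-bit value; a decoding sketch consistent with the extraction visible in
codegen_build_gpusort_keydesc() at the top of this patch (the exact packing on
the planner side is an assumption of this sketch):

    #include <stdint.h>

    #define KSORT_KEY_ATTR__NULLS_FIRST  0x0400U
    #define KSORT_KEY_ATTR__ORDER_ASC    0x8000U
    #define KSORT_KEY_KIND__MASK         0x03ffU
    #define KSORT_KEY_KIND__SHIFT        16

    static void decode_sortkey(uint32_t ival,
                               uint16_t *p_kind,
                               int8_t *p_nulls_first,
                               int8_t *p_order_asc)
    {
        /* kind lives in the upper half-word; flags in the lower one */
        *p_kind        = (ival >> KSORT_KEY_KIND__SHIFT) & KSORT_KEY_KIND__MASK;
        *p_nulls_first = ((ival & KSORT_KEY_ATTR__NULLS_FIRST) != 0);
        *p_order_asc   = ((ival & KSORT_KEY_ATTR__ORDER_ASC) != 0);
    }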