From d84e8f3107d7ab9fb94441f372562955cdf9b963 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei
Date: Fri, 7 Feb 2025 00:44:16 +0900
Subject: [PATCH] gpusort: initial workable GpuSort on top of GpuPreAgg

---
 src/codegen.c       |   8 +-
 src/cuda_gpusort.cu | 543 ++++++++++++++++++++++++++++++++++++++------
 src/gpu_join.c      |   4 +-
 src/gpu_preagg.c    |   4 +-
 src/gpu_service.c   |  67 +++++-
 src/xpu_common.h    |   4 +-
 6 files changed, 536 insertions(+), 94 deletions(-)

diff --git a/src/codegen.c b/src/codegen.c
index 9a3561a0..5db00f52 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -4035,7 +4035,7 @@ codegen_build_gpusort_keydesc(codegen_context *context,
 
 	keydesc->kind = kind;
 	keydesc->nulls_first = ((ival & KSORT_KEY_ATTR__NULLS_FIRST) != 0);
-	keydesc->desc_order = ((ival & KSORT_KEY_ATTR__DESC_ORDER) != 0);
+	keydesc->order_asc = ((ival & KSORT_KEY_ATTR__ORDER_ASC) != 0);
 	if (kind == KSORT_KEY_KIND__VREF)
 	{
 		ListCell   *cell;
@@ -4673,9 +4673,9 @@ __xpucode_sortkeys_cstring(StringInfo buf,
 		else if (desc->src_anum != 0)
 			appendStringInfo(buf, "; key=(out of range)");
 	}
-	appendStringInfo(buf, "; nulls %s; %s order>",
-					 desc->nulls_first ? "first" : "last",
-					 desc->desc_order ? "desc" : "asc");
+	appendStringInfo(buf, "[%s;%s]",
+					 desc->nulls_first ? "NF" : "NL",
+					 desc->order_asc ? "ASC" : "DESC");
 }
 
diff --git a/src/cuda_gpusort.cu b/src/cuda_gpusort.cu
index bee0f4e5..e2a55514 100644
--- a/src/cuda_gpusort.cu
+++ b/src/cuda_gpusort.cu
@@ -12,6 +12,329 @@
 #include "cuda_common.h"
 #include "float2.h"
 
+INLINE_FUNCTION(int)
+__gpusort_comp_rawkey(kern_context *kcxt,
+                      const kern_sortkey_desc *sdesc,
+                      const kern_data_store *kds_final,
+                      const kern_tupitem *titem_x,
+                      const kern_tupitem *titem_y)
+{
+	const void *addr_x = kern_fetch_heaptuple_attr(kcxt, kds_final, titem_x, sdesc->src_anum);
+	const void *addr_y = kern_fetch_heaptuple_attr(kcxt, kds_final, titem_y, sdesc->src_anum);
+
+	if (addr_x && addr_y)
+	{
+		const xpu_datum_operators *key_ops = sdesc->key_ops;
+		xpu_datum_t *datum_x;
+		xpu_datum_t *datum_y;
+		int		sz, comp;
+
+		/*
+		 * !!!workaround for a compiler bug!!!
+		 *
+		 * The pair of identical alloca() calls below was unintentionally
+		 * merged by the compiler; it apparently assumed that alloca()
+		 * returns the same value for an identical argument, so datum_x
+		 * and datum_y ended up pointing to the same buffer.
+		 *
+		 *   datum_x = (xpu_datum_t *)alloca(key_ops->xpu_type_sizeof);
+		 *   datum_y = (xpu_datum_t *)alloca(key_ops->xpu_type_sizeof);
+		 *
+		 * That assumption would be correct if alloca() were an immutable
+		 * function; however, alloca() extends the current stack frame and
+		 * returns a fresh temporary buffer on every call, so datum_x and
+		 * datum_y must be distinct pointers. As a workaround, we call
+		 * alloca() just once and split the buffer manually.
+		 */
+		sz = TYPEALIGN(16, key_ops->xpu_type_sizeof);
+		datum_x = (xpu_datum_t *)alloca(2 * sz);
+		datum_y = (xpu_datum_t *)((char *)datum_x + sz);
+		if (key_ops->xpu_datum_heap_read(kcxt, addr_x, datum_x) &&
+			key_ops->xpu_datum_heap_read(kcxt, addr_y, datum_y) &&
+			key_ops->xpu_datum_comp(kcxt, &comp, datum_x, datum_y))
+			return sdesc->order_asc ? comp : -comp;
+	}
+	else if (addr_x && !addr_y)
+		return (sdesc->nulls_first ? 1 : -1);	/* X is NOT NULL, Y is NULL */
+	else if (!addr_x && addr_y)
+		return (sdesc->nulls_first ? -1 : 1);	/* X is NULL, Y is NOT NULL */
+	return 0;
+}
+
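The single-alloca workaround above is worth remembering whenever two same-sized
scratch buffers are needed. A minimal standalone sketch of the pattern (plain C,
not part of the patch; TYPEALIGN16 is a stand-in for TYPEALIGN(16, ...)):

    #include <alloca.h>
    #include <stddef.h>

    #define TYPEALIGN16(sz)    (((sz) + 15) & ~(size_t)15)

    void fetch_two_datums(size_t type_sizeof)
    {
        size_t  sz = TYPEALIGN16(type_sizeof);
        char   *buf = (char *)alloca(2 * sz);   /* one call, two slots */
        void   *datum_x = buf;
        void   *datum_y = buf + sz;

        /* datum_x != datum_y holds by construction; the optimizer cannot
         * merge the two slots the way it merged two identical alloca()
         * calls. */
        (void)datum_x;
        (void)datum_y;
    }
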
+INLINE_FUNCTION(int)
+__gpusort_comp_pminmax_int64(kern_context *kcxt,
+                             const kern_sortkey_desc *sdesc,
+                             const kern_data_store *kds_final,
+                             const kern_tupitem *titem_x,
+                             const kern_tupitem *titem_y)
+{
+	const kagg_state__pminmax_int64_packed *x = (const kagg_state__pminmax_int64_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_x, sdesc->src_anum);
+	const kagg_state__pminmax_int64_packed *y = (const kagg_state__pminmax_int64_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_y, sdesc->src_anum);
+
+	if (x && (x->attrs & __PAGG_MINMAX_ATTRS__VALID) != 0)
+	{
+		if (y && (y->attrs & __PAGG_MINMAX_ATTRS__VALID) != 0)
+		{
+			if (x->value < y->value)
+				return (sdesc->order_asc ? -1 : 1);
+			if (x->value > y->value)
+				return (sdesc->order_asc ? 1 : -1);
+			return 0;
+		}
+		else
+			return (sdesc->nulls_first ? 1 : -1);	/* X is NOT NULL, Y is NULL */
+	}
+	else if (y && (y->attrs & __PAGG_MINMAX_ATTRS__VALID) != 0)
+		return (sdesc->nulls_first ? -1 : 1);		/* X is NULL, Y is NOT NULL */
+	return 0;
+}
+
+INLINE_FUNCTION(int)
+__gpusort_comp_pminmax_fp64(kern_context *kcxt,
+                            const kern_sortkey_desc *sdesc,
+                            const kern_data_store *kds_final,
+                            const kern_tupitem *titem_x,
+                            const kern_tupitem *titem_y)
+{
+	const kagg_state__pminmax_fp64_packed *x = (const kagg_state__pminmax_fp64_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_x, sdesc->src_anum);
+	const kagg_state__pminmax_fp64_packed *y = (const kagg_state__pminmax_fp64_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_y, sdesc->src_anum);
+
+	if (x && (x->attrs & __PAGG_MINMAX_ATTRS__VALID) != 0)
+	{
+		if (y && (y->attrs & __PAGG_MINMAX_ATTRS__VALID) != 0)
+		{
+			if (x->value < y->value)
+				return (sdesc->order_asc ? -1 : 1);
+			if (x->value > y->value)
+				return (sdesc->order_asc ? 1 : -1);
+			return 0;
+		}
+		else
+			return (sdesc->nulls_first ? 1 : -1);	/* X is NOT NULL, Y is NULL */
+	}
+	else if (y && (y->attrs & __PAGG_MINMAX_ATTRS__VALID) != 0)
+		return (sdesc->nulls_first ? -1 : 1);		/* X is NULL, Y is NOT NULL */
+	return 0;
+}
+
+INLINE_FUNCTION(int)
+__gpusort_comp_psum_int64(kern_context *kcxt,
+                          const kern_sortkey_desc *sdesc,
+                          const kern_data_store *kds_final,
+                          const kern_tupitem *titem_x,
+                          const kern_tupitem *titem_y)
+{
+	const kagg_state__psum_int_packed *x = (const kagg_state__psum_int_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_x, sdesc->src_anum);
+	const kagg_state__psum_int_packed *y = (const kagg_state__psum_int_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_y, sdesc->src_anum);
+
+	if (x && x->nitems > 0)
+	{
+		if (y && y->nitems > 0)
+		{
+			if (x->sum < y->sum)
+				return (sdesc->order_asc ? -1 : 1);
+			if (x->sum > y->sum)
+				return (sdesc->order_asc ? 1 : -1);
+			return 0;
+		}
+		else
+			return (sdesc->nulls_first ? 1 : -1);	/* X!=NULL and Y==NULL */
+	}
+	else if (y && y->nitems > 0)
+		return (sdesc->nulls_first ? -1 : 1);		/* X==NULL and Y!=NULL */
+	return 0;
+}
+
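All of the __gpusort_comp_* helpers share one contract: a negative result sorts
X before Y, a positive result sorts Y before X, and an invalid (NULL) accumulator
is placed according to nulls_first. A host-side sketch of that contract, with
hypothetical names (not part of the patch):

    /* comp < 0: X precedes Y; comp > 0: Y precedes X; comp == 0: tie. */
    static int comp_nullable_int64(long x, int x_isnull,
                                   long y, int y_isnull,
                                   int order_asc, int nulls_first)
    {
        if (!x_isnull && !y_isnull)
        {
            if (x < y) return order_asc ? -1 : 1;
            if (x > y) return order_asc ? 1 : -1;
            return 0;
        }
        if (!x_isnull)              /* only Y is NULL */
            return nulls_first ? 1 : -1;
        if (!y_isnull)              /* only X is NULL */
            return nulls_first ? -1 : 1;
        return 0;
    }

Note that nulls_first is applied independently of order_asc, which matches
PostgreSQL's NULLS FIRST/LAST semantics.
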
+INLINE_FUNCTION(int)
+__gpusort_comp_psum_fp64(kern_context *kcxt,
+                         const kern_sortkey_desc *sdesc,
+                         const kern_data_store *kds_final,
+                         const kern_tupitem *titem_x,
+                         const kern_tupitem *titem_y)
+{
+	const kagg_state__psum_fp_packed *x = (const kagg_state__psum_fp_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_x, sdesc->src_anum);
+	const kagg_state__psum_fp_packed *y = (const kagg_state__psum_fp_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_y, sdesc->src_anum);
+
+	if (x && x->nitems > 0)
+	{
+		if (y && y->nitems > 0)
+		{
+			if (x->sum < y->sum)
+				return (sdesc->order_asc ? -1 : 1);
+			if (x->sum > y->sum)
+				return (sdesc->order_asc ? 1 : -1);
+			return 0;
+		}
+		else
+			return (sdesc->nulls_first ? 1 : -1);	/* X!=NULL and Y==NULL */
+	}
+	else if (y && y->nitems > 0)
+		return (sdesc->nulls_first ? -1 : 1);		/* X==NULL and Y!=NULL */
+	return 0;
+}
+
+INLINE_FUNCTION(int)
+__gpusort_comp_psum_numeric(kern_context *kcxt,
+                            const kern_sortkey_desc *sdesc,
+                            const kern_data_store *kds_final,
+                            const kern_tupitem *titem_x,
+                            const kern_tupitem *titem_y)
+{
+	const kagg_state__psum_numeric_packed *x = (const kagg_state__psum_numeric_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_x, sdesc->src_anum);
+	const kagg_state__psum_numeric_packed *y = (const kagg_state__psum_numeric_packed *)
+		kern_fetch_heaptuple_attr(kcxt, kds_final, titem_y, sdesc->src_anum);
+
+	if (x && x->nitems > 0)
+	{
+		if (y && y->nitems > 0)
+		{
+			xpu_numeric_t x_datum;
+			xpu_numeric_t y_datum;
+			int		xspecial = (x->attrs & __PAGG_NUMERIC_ATTRS__MASK);
+			int		yspecial = (y->attrs & __PAGG_NUMERIC_ATTRS__MASK);
+			int		comp = 0;
+
+			if (xspecial == 0)
+			{
+				x_datum.kind = XPU_NUMERIC_KIND__VALID;
+				x_datum.weight = (int16_t)(x->attrs & __PAGG_NUMERIC_ATTRS__WEIGHT);
+				x_datum.u.value = __fetch_int128_packed(&x->sum);
+			}
+			else if (xspecial == __PAGG_NUMERIC_ATTRS__PINF)
+				x_datum.kind = XPU_NUMERIC_KIND__POS_INF;
+			else if (xspecial == __PAGG_NUMERIC_ATTRS__NINF)
+				x_datum.kind = XPU_NUMERIC_KIND__NEG_INF;
+			else
+				x_datum.kind = XPU_NUMERIC_KIND__NAN;
+			x_datum.expr_ops = &xpu_numeric_ops;
+
+			if (yspecial == 0)
+			{
+				y_datum.kind = XPU_NUMERIC_KIND__VALID;
+				y_datum.weight = (int16_t)(y->attrs & __PAGG_NUMERIC_ATTRS__WEIGHT);
+				y_datum.u.value = __fetch_int128_packed(&y->sum);
+			}
+			else if (yspecial == __PAGG_NUMERIC_ATTRS__PINF)
+				y_datum.kind = XPU_NUMERIC_KIND__POS_INF;
+			else if (yspecial == __PAGG_NUMERIC_ATTRS__NINF)
+				y_datum.kind = XPU_NUMERIC_KIND__NEG_INF;
+			else
+				y_datum.kind = XPU_NUMERIC_KIND__NAN;
+			y_datum.expr_ops = &xpu_numeric_ops;
+
+			sdesc->key_ops->xpu_datum_comp(kcxt,
+										   &comp,
+										   (xpu_datum_t *)&x_datum,
+										   (xpu_datum_t *)&y_datum);
+			return (sdesc->order_asc ? comp : -comp);
+		}
+		else
+			return (sdesc->nulls_first ? 1 : -1);	/* X!=NULL, Y==NULL */
+	}
+	else if (y && y->nitems > 0)
+		return (sdesc->nulls_first ? -1 : 1);		/* X==NULL, Y!=NULL */
+	return 0;
+}
+
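For intuition, the core of comparing two finite packed numeric sums is a
weight-reconciled int128 comparison. A simplified sketch below, assuming a
value * 10^(-weight) encoding (the real xpu_datum_comp() also handles the
NaN/+Inf/-Inf kinds and avoids the overflow this naive rescaling permits);
the helper name is hypothetical:

    static int numeric_comp_sketch(__int128 v1, int w1,
                                   __int128 v2, int w2)
    {
        /* rescale the smaller-weight value up until weights match;
         * v * 10^(-w) is invariant under (v *= 10, w += 1) */
        while (w1 < w2) { v1 *= 10; w1++; }
        while (w2 < w1) { v2 *= 10; w2++; }
        return (v1 < v2) ? -1 : (v1 > v2) ? 1 : 0;
    }
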
+INLINE_FUNCTION(int)
+__gpusort_comp_precomp_fp64(const kern_sortkey_desc *sdesc,
+                            const kern_tupitem *titem_x,
+                            const kern_tupitem *titem_y)
+{
+	const char *addr_x = ((char *)&titem_x->htup + titem_x->t_len + sdesc->buf_offset);
+	const char *addr_y = ((char *)&titem_y->htup + titem_y->t_len + sdesc->buf_offset);
+	bool		notnull_x = *addr_x++;
+	bool		notnull_y = *addr_y++;
+	float8_t	fval_x;
+	float8_t	fval_y;
+
+	if (notnull_x && notnull_y)
+	{
+		memcpy(&fval_x, addr_x, sizeof(float8_t));
+		memcpy(&fval_y, addr_y, sizeof(float8_t));
+		if (fval_x < fval_y)
+			return (sdesc->order_asc ? -1 : 1);
+		if (fval_x > fval_y)
+			return (sdesc->order_asc ? 1 : -1);
+	}
+	else if (notnull_x && !notnull_y)
+		return (sdesc->nulls_first ? 1 : -1);
+	else if (!notnull_x && notnull_y)
+		return (sdesc->nulls_first ? -1 : 1);
+	return 0;
+}
+
+INLINE_FUNCTION(int)
+__gpusort_comp_keys(kern_context *kcxt,
+                    const kern_expression *sort_kexp,
+                    const kern_data_store *kds_final,
+                    const kern_tupitem *titem_x,
+                    const kern_tupitem *titem_y)
+{
+	if (!titem_x)
+		return (!titem_y ? 0 : 1);
+	else if (!titem_y)
+		return -1;
+
+	for (int k=0; k < sort_kexp->u.sort.nkeys; k++)
+	{
+		const kern_sortkey_desc *sdesc = &sort_kexp->u.sort.desc[k];
+		int		comp;
+
+		switch (sdesc->kind)
+		{
+		case KSORT_KEY_KIND__VREF:
+			comp = __gpusort_comp_rawkey(kcxt, sdesc, kds_final, titem_x, titem_y);
+			break;
+		case KSORT_KEY_KIND__PMINMAX_INT64:
+			comp = __gpusort_comp_pminmax_int64(kcxt, sdesc, kds_final, titem_x, titem_y);
+			break;
+		case KSORT_KEY_KIND__PMINMAX_FP64:
+			comp = __gpusort_comp_pminmax_fp64(kcxt, sdesc, kds_final, titem_x, titem_y);
+			break;
+		case KSORT_KEY_KIND__PSUM_INT64:
+			comp = __gpusort_comp_psum_int64(kcxt, sdesc, kds_final, titem_x, titem_y);
+			break;
+		case KSORT_KEY_KIND__PSUM_FP64:
+			comp = __gpusort_comp_psum_fp64(kcxt, sdesc, kds_final, titem_x, titem_y);
+			break;
+		case KSORT_KEY_KIND__PSUM_NUMERIC:
+			comp = __gpusort_comp_psum_numeric(kcxt, sdesc, kds_final, titem_x, titem_y);
+			break;
+		case KSORT_KEY_KIND__PAVG_INT64:
+		case KSORT_KEY_KIND__PAVG_FP64:
+		case KSORT_KEY_KIND__PAVG_NUMERIC:
+		case KSORT_KEY_KIND__PVARIANCE_SAMP:
+		case KSORT_KEY_KIND__PVARIANCE_POP:
+		case KSORT_KEY_KIND__PCOVAR_CORR:
+		case KSORT_KEY_KIND__PCOVAR_SAMP:
+		case KSORT_KEY_KIND__PCOVAR_POP:
+		case KSORT_KEY_KIND__PCOVAR_AVGX:
+		case KSORT_KEY_KIND__PCOVAR_AVGY:
+		case KSORT_KEY_KIND__PCOVAR_COUNT:
+		case KSORT_KEY_KIND__PCOVAR_INTERCEPT:
+		case KSORT_KEY_KIND__PCOVAR_REGR_R2:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SLOPE:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SXX:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SXY:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SYY:
+			/* pre-computed float8 values */
+			comp = __gpusort_comp_precomp_fp64(sdesc, titem_x, titem_y);
+			break;
+		default:
+			/* Bug? should not happen */
+			comp = 0;
+			break;
+		}
+		if (comp != 0)
+			return comp;
+	}
+	return 0;
+}
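
__gpusort_comp_precomp_fp64() assumes each tuple carries an out-of-band key
buffer immediately after its t_len bytes: a one-byte not-null flag followed by
a float8 at sdesc->buf_offset. A host-side sketch of one slot's layout
(hypothetical helper names; slot_base corresponds to
(char *)&titem->htup + titem->t_len):

    #include <string.h>

    /* per-key slot layout: [1-byte notnull flag][8-byte float8 value] */
    static void precomp_write(char *slot_base, int buf_offset,
                              int notnull, double value)
    {
        char *dest = slot_base + buf_offset;

        *dest++ = (char)notnull;
        if (notnull)
            memcpy(dest, &value, sizeof(double));
    }

    static int precomp_read(const char *slot_base, int buf_offset,
                            double *p_value)
    {
        const char *addr = slot_base + buf_offset;

        if (!*addr++)
            return 0;       /* NULL key */
        memcpy(p_value, addr, sizeof(double));
        return 1;
    }

memcpy() is used instead of direct float8 loads because the slot is only
byte-aligned (the flag byte shifts the value by one).
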
 
 /*
  * kern_gpusort_exec_bitonic
  */
@@ -20,16 +343,64 @@ KERNEL_FUNCTION(void)
 kern_gpusort_exec_bitonic(kern_session_info *session,
                           kern_gputask *kgtask,
                           kern_data_store *kds_final,
-                          int step)
+                          uint32_t nr_threads,
+                          uint64_t *row_index,
+                          int scale, int step)
 {
+	const kern_expression *sort_kexp = SESSION_KEXP_GPUSORT_KEYDESC(session);
+	const char *end = (const char *)kds_final + kds_final->length;
+	kern_context *kcxt;
+	uint32_t	thread_id;
+
+	/* sanity checks */
+	assert(get_local_size() <= CUDA_MAXTHREADS_PER_BLOCK);
+	assert((nr_threads & (nr_threads-1)) == 0);
+	/* save the GPU-Task specific read-only properties */
+	if (get_local_id() == 0)
+	{
+		stromTaskProp__cuda_dindex        = kgtask->cuda_dindex;
+		stromTaskProp__cuda_stack_limit   = kgtask->cuda_stack_limit;
+		stromTaskProp__partition_divisor  = kgtask->partition_divisor;
+		stromTaskProp__partition_reminder = kgtask->partition_reminder;
+	}
+	/* setup execution context */
+	INIT_KERNEL_CONTEXT(kcxt, session, NULL);
+	if (!row_index)
+		row_index = KDS_GET_ROWINDEX(kds_final);
+	for (thread_id = get_global_id();
+		 thread_id < nr_threads;
+		 thread_id += get_global_size())
+	{
+		uint32_t	base = ((thread_id >> scale) << (scale+1));
+		uint32_t	m_bits = (thread_id & ((1U<<scale)-1)) >> step;
+		uint32_t	l_bits = (thread_id & ((1U<<step)-1));
+		uint32_t	index = base + (m_bits << (step+1)) + l_bits;
+		uint32_t	buddy = index + (1U<<step);
+		const kern_tupitem *titem_x = (row_index[index] == 0 ? NULL
+									   : (const kern_tupitem *)(end - row_index[index]));
+		const kern_tupitem *titem_y = (row_index[buddy] == 0 ? NULL
+									   : (const kern_tupitem *)(end - row_index[buddy]));
+		int			comp;
+
+		comp = __gpusort_comp_keys(kcxt, sort_kexp, kds_final, titem_x, titem_y);
+		if ((thread_id & (1U<<scale)) != 0)
+			comp = -comp;	/* this comparator belongs to a descending block */
+		if (comp > 0)
+		{
+			uint64_t	temp = row_index[index];
+
+			row_index[index] = row_index[buddy];
+			row_index[buddy] = temp;
+		}
+	}
 }
 
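The base/m_bits/l_bits arithmetic maps thread_id to one compare-exchange pair
of a bitonic network over 2*nr_threads slots. A host-side sketch that runs the
same (scale, step) schedule over a small array to show the mapping sorts
correctly (standalone C, power-of-two element count assumed):

    /* Sort n = 2^log2n ints ascending; nr_threads = n/2 comparators. */
    static void bitonic_sort(int *v, unsigned log2n)
    {
        unsigned nr_threads = 1U << (log2n - 1);

        for (unsigned scale = 0; scale < log2n; scale++)
        {
            for (int step = (int)scale; step >= 0; step--)
            {
                for (unsigned tid = 0; tid < nr_threads; tid++)
                {
                    unsigned base   = (tid >> scale) << (scale + 1);
                    unsigned m_bits = (tid & ((1U << scale) - 1)) >> step;
                    unsigned l_bits = (tid & ((1U << step) - 1));
                    unsigned index  = base + (m_bits << (step + 1)) + l_bits;
                    unsigned buddy  = index + (1U << step);
                    int      desc   = (tid & (1U << scale)) != 0;
                    int      comp   = (v[index] > v[buddy]) - (v[index] < v[buddy]);

                    if ((desc ? -comp : comp) > 0)
                    {
                        int t = v[index]; v[index] = v[buddy]; v[buddy] = t;
                    }
                }
            }
        }
    }

For example, bitonic_sort(v, 3) sorts an 8-element array. In the final stage
(scale == log2n-1) the direction bit is always zero because tid < nr_threads,
so the last merge is globally ascending.
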
-STATIC_FUNCTION(void)
-__gpusort_finalize_pavg_int64(kern_context *kcxt,
-                              kern_data_store *kds_final,
-                              kern_tupitem *titem,
-                              const kern_sortkey_desc *sdesc)
+INLINE_FUNCTION(void)
+__gpusort_prep_pavg_int64(kern_context *kcxt,
+                          const kern_data_store *kds_final,
+                          const kern_tupitem *titem,
+                          const kern_sortkey_desc *sdesc)
 {
 	const void *addr;
 	char	   *dest;
@@ -55,11 +426,11 @@ __gpusort_finalize_pavg_int64(kern_context *kcxt,
 	}
 }
 
-STATIC_FUNCTION(void)
-__gpusort_finalize_pavg_fp64(kern_context *kcxt,
-                             kern_data_store *kds_final,
-                             kern_tupitem *titem,
-                             const kern_sortkey_desc *sdesc)
+INLINE_FUNCTION(void)
+__gpusort_prep_pavg_fp64(kern_context *kcxt,
+                         const kern_data_store *kds_final,
+                         const kern_tupitem *titem,
+                         const kern_sortkey_desc *sdesc)
 {
 	const void *addr;
 	char	   *dest;
@@ -85,11 +456,11 @@ __gpusort_finalize_pavg_fp64(kern_context *kcxt,
 	}
 }
 
-STATIC_FUNCTION(void)
-__gpusort_finalize_pavg_numeric(kern_context *kcxt,
-                                kern_data_store *kds_final,
-                                kern_tupitem *titem,
-                                const kern_sortkey_desc *sdesc)
+INLINE_FUNCTION(void)
+__gpusort_prep_pavg_numeric(kern_context *kcxt,
+                            const kern_data_store *kds_final,
+                            const kern_tupitem *titem,
+                            const kern_sortkey_desc *sdesc)
 {
 	const void *addr;
 	char	   *dest;
@@ -162,18 +534,17 @@ __gpusort_finalize_pavg_numeric(kern_context *kcxt,
 				fval = -INFINITY;
 			else
 				fval = NAN;
-			printf("fval = %f\n", fval);
 			memcpy(dest, &fval, sizeof(float8_t));
 		}
 	}
 }
-
-STATIC_FUNCTION(void)
-__gpusort_finalize_pvariance(kern_context *kcxt,
-                             kern_data_store *kds_final,
-                             kern_tupitem *titem,
-                             const kern_sortkey_desc *sdesc)
+
+INLINE_FUNCTION(void)
+__gpusort_prep_pvariance(kern_context *kcxt,
+                         const kern_data_store *kds_final,
+                         const kern_tupitem *titem,
+                         const kern_sortkey_desc *sdesc)
 {
 	const void *addr;
 	char	   *dest;
@@ -228,11 +599,11 @@ __gpusort_finalize_pvariance(kern_context *kcxt,
 	}
 }
 
-STATIC_FUNCTION(void)
-__gpusort_finalize_pcovariance(kern_context *kcxt,
-                               kern_data_store *kds_final,
-                               kern_tupitem *titem,
-                               const kern_sortkey_desc *sdesc)
+INLINE_FUNCTION(void)
+__gpusort_prep_pcovariance(kern_context *kcxt,
+                           const kern_data_store *kds_final,
+                           const kern_tupitem *titem,
+                           const kern_sortkey_desc *sdesc)
 {
 	const void *addr;
 	char	   *dest;
@@ -344,20 +715,75 @@
 }
 
 /*
- * kern_gpusort_finalize_buffer
+ * per-tuple preparation on demand
+ */
+INLINE_FUNCTION(void)
+__gpusort_prep_tupitem(kern_context *kcxt,
+                       const kern_expression *sort_kexp,
+                       const kern_data_store *kds_final,
+                       uint32_t kds_index)
+{
+	const kern_tupitem *titem = KDS_GET_TUPITEM(kds_final, kds_index);
+
+	for (int k=0; k < sort_kexp->u.sort.nkeys; k++)
+	{
+		const kern_sortkey_desc *sdesc = &sort_kexp->u.sort.desc[k];
+
+		switch (sdesc->kind)
+		{
+		case KSORT_KEY_KIND__PAVG_INT64:
+			__gpusort_prep_pavg_int64(kcxt, kds_final, titem, sdesc);
+			break;
+		case KSORT_KEY_KIND__PAVG_FP64:
+			__gpusort_prep_pavg_fp64(kcxt, kds_final, titem, sdesc);
+			break;
+		case KSORT_KEY_KIND__PAVG_NUMERIC:
+			__gpusort_prep_pavg_numeric(kcxt, kds_final, titem, sdesc);
+			break;
+		case KSORT_KEY_KIND__PVARIANCE_SAMP:
+		case KSORT_KEY_KIND__PVARIANCE_POP:
+			__gpusort_prep_pvariance(kcxt, kds_final, titem, sdesc);
+			break;
+		case KSORT_KEY_KIND__PCOVAR_CORR:
+		case KSORT_KEY_KIND__PCOVAR_SAMP:
+		case KSORT_KEY_KIND__PCOVAR_POP:
+		case KSORT_KEY_KIND__PCOVAR_AVGX:
+		case KSORT_KEY_KIND__PCOVAR_AVGY:
+		case KSORT_KEY_KIND__PCOVAR_COUNT:
+		case KSORT_KEY_KIND__PCOVAR_INTERCEPT:
+		case KSORT_KEY_KIND__PCOVAR_REGR_R2:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SLOPE:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SXX:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SXY:
+		case KSORT_KEY_KIND__PCOVAR_REGR_SYY:
+			__gpusort_prep_pcovariance(kcxt, kds_final, titem, sdesc);
+			break;
+		default:
+			/* nothing to do */
+			break;
+		}
+	}
+}
+
+/*
+ * kern_gpusort_prep_buffer
  */
 KERNEL_FUNCTION(void)
-kern_gpusort_finalize_buffer(kern_session_info *session,
-                             kern_gputask *kgtask,
-                             kern_data_store *kds_final)
+kern_gpusort_prep_buffer(kern_session_info *session,
+                         kern_gputask *kgtask,
+                         kern_data_store *kds_final,
+                         uint32_t nr_threads,
+                         uint64_t *row_index)
 {
 	const kern_expression *sort_kexp = SESSION_KEXP_GPUSORT_KEYDESC(session);
 	kern_context *kcxt;
+	uint32_t	nrooms = 2 * nr_threads;
 	uint32_t	index;
 
 	/* sanity checks */
 	assert(get_local_size() <= CUDA_MAXTHREADS_PER_BLOCK);
-	assert(sort_kexp->u.sort.needs_finalization);
+	assert(kds_final->nitems >= nr_threads &&
+		   kds_final->nitems <= nrooms);
 	/* save the GPU-Task specific read-only properties */
 	if (get_local_id() == 0)
 	{
@@ -368,49 +794,18 @@ kern_gpusort_finalize_buffer(kern_session_info *session,
 	}
 	/* setup execution context */
 	INIT_KERNEL_CONTEXT(kcxt, session, NULL);
-	for (index=get_global_id();
-		 index < kds_final->nitems;
-		 index += get_global_size())
+	for (index=get_global_id(); index < nrooms; index += get_global_size())
 	{
-		kern_tupitem *titem = KDS_GET_TUPITEM(kds_final, index);
-
-		for (int k=0; k < sort_kexp->u.sort.nkeys; k++)
+		if (index < kds_final->nitems)
 		{
-			const kern_sortkey_desc *sdesc = &sort_kexp->u.sort.desc[k];
-
-			switch (sdesc->kind)
-			{
-			case KSORT_KEY_KIND__PAVG_INT64:
-				__gpusort_finalize_pavg_int64(kcxt, kds_final, titem, sdesc);
-				break;
-			case KSORT_KEY_KIND__PAVG_FP64:
-				__gpusort_finalize_pavg_fp64(kcxt, kds_final, titem, sdesc);
-				break;
-			case KSORT_KEY_KIND__PAVG_NUMERIC:
-				__gpusort_finalize_pavg_numeric(kcxt, kds_final, titem, sdesc);
-				break;
-			case KSORT_KEY_KIND__PVARIANCE_SAMP:
-			case KSORT_KEY_KIND__PVARIANCE_POP:
-				__gpusort_finalize_pvariance(kcxt, kds_final, titem, sdesc);
-				break;
-			case KSORT_KEY_KIND__PCOVAR_CORR:
-			case KSORT_KEY_KIND__PCOVAR_SAMP:
-			case KSORT_KEY_KIND__PCOVAR_POP:
-			case KSORT_KEY_KIND__PCOVAR_AVGX:
-			case KSORT_KEY_KIND__PCOVAR_AVGY:
-			case KSORT_KEY_KIND__PCOVAR_COUNT:
-			case KSORT_KEY_KIND__PCOVAR_INTERCEPT:
-			case KSORT_KEY_KIND__PCOVAR_REGR_R2:
-			case KSORT_KEY_KIND__PCOVAR_REGR_SLOPE:
-			case KSORT_KEY_KIND__PCOVAR_REGR_SXX:
-			case KSORT_KEY_KIND__PCOVAR_REGR_SXY:
-			case KSORT_KEY_KIND__PCOVAR_REGR_SYY:
-				__gpusort_finalize_pcovariance(kcxt, kds_final, titem, sdesc);
-				break;
-			default:
-				/* nothing to do */
-				break;
-			}
+			if (sort_kexp->u.sort.needs_finalization)
+				__gpusort_prep_tupitem(kcxt, sort_kexp, kds_final, index);
+			if (row_index)
+				row_index[index] = KDS_GET_ROWINDEX(kds_final)[index];
 		}
+		else if (row_index)
+			row_index[index] = 0;		/* no tuple */
+		else
+			KDS_GET_ROWINDEX(kds_final)[index] = 0;	/* no tuple */
 	}
 }
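
kern_gpusort_prep_buffer pads the row-index out to the next power of two with
zero entries (a KDS row offset of zero means "no tuple", since real offsets are
measured back from the buffer tail), so the bitonic network always sees exactly
2*nr_threads slots and the comparator sorts the empty slots past every real
tuple. A host-side sketch of that padding step (hypothetical helper name):

    #include <stdint.h>
    #include <string.h>

    /* copy nitems live offsets, then zero-fill up to nrooms = 2^k slots */
    static void pad_row_index(uint64_t *dst, const uint64_t *src,
                              uint32_t nitems, uint32_t nrooms)
    {
        memcpy(dst, src, sizeof(uint64_t) * nitems);
        memset(dst + nitems, 0, sizeof(uint64_t) * (nrooms - nitems));
    }
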
diff --git a/src/gpu_join.c b/src/gpu_join.c
index fbd1ff23..b5c0f31f 100644
--- a/src/gpu_join.c
+++ b/src/gpu_join.c
@@ -960,8 +960,8 @@ try_add_sorted_gpujoin_path(PlannerInfo *root,
 
 		if (pk->pk_nulls_first)
 			kind |= KSORT_KEY_ATTR__NULLS_FIRST;
-		if (pk->pk_strategy == BTGreaterStrategyNumber)
-			kind |= KSORT_KEY_ATTR__DESC_ORDER;
+		if (pk->pk_strategy == BTLessStrategyNumber)
+			kind |= KSORT_KEY_ATTR__ORDER_ASC;
-		else if (pk->pk_strategy != BTLessStrategyNumber)
+		else if (pk->pk_strategy != BTGreaterStrategyNumber)
 			return;		/* should not happen */
 		sortkeys_expr = lappend(sortkeys_expr, f_expr);
diff --git a/src/gpu_preagg.c b/src/gpu_preagg.c
index deb15856..d8e3923a 100644
--- a/src/gpu_preagg.c
+++ b/src/gpu_preagg.c
@@ -2031,8 +2031,8 @@ consider_sorted_groupby_path(PlannerInfo *root,
 		}
 		if (pk->pk_nulls_first)
 			kind |= KSORT_KEY_ATTR__NULLS_FIRST;
-		if (pk->pk_strategy == BTGreaterStrategyNumber)
-			kind |= KSORT_KEY_ATTR__DESC_ORDER;
+		if (pk->pk_strategy == BTLessStrategyNumber)
+			kind |= KSORT_KEY_ATTR__ORDER_ASC;
-		else if (pk->pk_strategy != BTLessStrategyNumber)
+		else if (pk->pk_strategy != BTGreaterStrategyNumber)
 			return false;	/* should not happen */
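Both planner hooks now translate a PathKey into the packed sort-key flags the
same way. Note the fallback branch must test against BTGreaterStrategyNumber
once the flag's polarity flips to ORDER_ASC, otherwise every DESC key would
bail out. A condensed sketch of the mapping (hypothetical helper; the b-tree
strategy numbers are passed in to keep the sketch self-contained):

    #include <stdbool.h>
    #include <stdint.h>

    #define KSORT_KEY_ATTR__NULLS_FIRST  0x0400U
    #define KSORT_KEY_ATTR__ORDER_ASC    0x8000U

    /* returns false for unsupported b-tree strategies */
    static bool pathkey_to_ksort_flags(int pk_strategy, bool pk_nulls_first,
                                       int bt_less, int bt_greater,
                                       uint32_t *p_kind)
    {
        uint32_t kind = *p_kind;

        if (pk_nulls_first)
            kind |= KSORT_KEY_ATTR__NULLS_FIRST;
        if (pk_strategy == bt_less)
            kind |= KSORT_KEY_ATTR__ORDER_ASC;   /* ASC */
        else if (pk_strategy != bt_greater)
            return false;                        /* should not happen */
        *p_kind = kind;
        return true;
    }
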
diff --git a/src/gpu_service.c b/src/gpu_service.c
index 84ac5cfc..2f74219e 100644
--- a/src/gpu_service.c
+++ b/src/gpu_service.c
@@ -42,7 +42,7 @@ struct gpuContext
 	CUfunction	cufn_prep_gistindex;
 	CUfunction	cufn_merge_outer_join_map;
 	CUfunction	cufn_merge_gpupreagg_buffer;
-	CUfunction	cufn_gpusort_finalize_buffer;
+	CUfunction	cufn_gpusort_prep_buffer;
 	CUfunction	cufn_gpusort_exec_bitonic;
 	CUfunction	cufn_kbuf_partitioning;
 	CUfunction	cufn_kbuf_reconstruction;
@@ -4325,15 +4325,20 @@ gpuservSortingFinalBuffer(gpuClient *gclient,
 	kern_session_info *session = gclient->h_session;
 	kern_expression *kexp_gpusort = SESSION_KEXP_GPUSORT_KEYDESC(session);
 	gpuContext *old_gcontext = NULL;
+	uint32_t	nr_threads;
+	int			max_scale;
+	int			scale, step;
 	int			grid_sz, block_sz;
+	size_t		required;
 	gpuMemChunk *t_chunk = NULL;
 	kern_gputask *kgtask;
 	CUdeviceptr	m_session;
-	void	   *kern_args[4];
+	CUdeviceptr	m_rowindex = 0UL;
+	void	   *kern_args[10];
 	CUresult	rc;
 	bool		retval = false;
 
-	if (!kexp_gpusort || !kexp_gpusort->u.sort.needs_finalization)
+	if (!kexp_gpusort || kds_final->nitems <= 1)
 		return true;	/* nothing to do */
 	if (gcontext != GpuWorkerCurrentContext)
 		old_gcontext = gpuContextSwitchTo(gcontext);
 
 	rc = gpuOptimalBlockSize(&grid_sz,
 							 &block_sz,
-							 gcontext->cufn_gpusort_finalize_buffer, 0);
+							 gcontext->cufn_gpusort_prep_buffer, 0);
 	if (rc != CUDA_SUCCESS)
 	{
 		gpuClientELog(gclient, "failed on gpuOptimalBlockSize: %s",
 					  cuStrError(rc));
 		goto bailout;
 	}
-	grid_sz = Min(grid_sz, (kds_final->nitems + block_sz - 1) / block_sz);
+	max_scale = get_next_log2(kds_final->nitems) - 1;
+	nr_threads = (1UL << max_scale);
+	grid_sz = Min(grid_sz, (nr_threads + block_sz - 1) / block_sz);
 
 	/*
 	 * Setup the control structure
 	 */
-	t_chunk = gpuMemAllocManaged(sizeof(kern_gputask));
+	required = MAXALIGN(sizeof(kern_gputask));
+	if (!__KDS_CHECK_OVERFLOW(kds_final, 2 * nr_threads, kds_final->usage))
+	{
+		/* kds_final cannot host the expanded row-index; keep it separated */
+		required += sizeof(uint64_t) * 2 * nr_threads;
+		m_rowindex = -1L;	/* assigned right after the allocation */
+	}
+	t_chunk = gpuMemAllocManaged(required);
 	if (!t_chunk)
 	{
 		gpuClientFatal(gclient, "failed on gpuMemAllocManaged: %lu",
@@ -4366,13 +4380,19 @@ gpuservSortingFinalBuffer(gpuClient *gclient,
 	kgtask->block_sz = block_sz;
 	kgtask->cuda_dindex = MY_DINDEX_PER_THREAD;
 	kgtask->cuda_stack_limit = GpuWorkerCurrentContext->cuda_stack_limit;
+	if (m_rowindex != 0UL)
+		m_rowindex = t_chunk->m_devptr + MAXALIGN(sizeof(kern_gputask));
 
 	m_session = gclient->__session[MY_DINDEX_PER_THREAD]->m_devptr;
 	kern_args[0] = &m_session;
 	kern_args[1] = &kgtask;
 	kern_args[2] = &kds_final;
+	kern_args[3] = &nr_threads;
+	kern_args[4] = &m_rowindex;
+	kern_args[5] = &scale;
+	kern_args[6] = &step;
 
-	rc = cuLaunchKernel(gcontext->cufn_gpusort_finalize_buffer,
+	rc = cuLaunchKernel(gcontext->cufn_gpusort_prep_buffer,
 						grid_sz, 1, 1,
 						block_sz, 1, 1,
 						0,
@@ -4385,6 +4405,27 @@ gpuservSortingFinalBuffer(gpuClient *gclient,
 					  grid_sz, block_sz, cuStrError(rc));
 		goto bailout;
 	}
+	/* runs bitonic sorting */
+	for (scale=0; scale <= max_scale; scale++)
+	{
+		for (step=scale; step >= 0; step--)
+		{
+			rc = cuLaunchKernel(gcontext->cufn_gpusort_exec_bitonic,
+								grid_sz, 1, 1,
+								block_sz, 1, 1,
+								0,
+								MY_STREAM_PER_THREAD,
+								kern_args,
+								NULL);
+			if (rc != CUDA_SUCCESS)
+			{
+				gpuClientELog(gclient, "failed on cuLaunchKernel(grid_sz=%d, block_sz=%d): %s",
+							  grid_sz, block_sz, cuStrError(rc));
+				goto bailout;
+			}
+		}
+	}
+	/* wait for completion */
 	rc = cuStreamSynchronize(MY_STREAM_PER_THREAD);
 	if (rc != CUDA_SUCCESS)
 	{
@@ -4392,7 +4433,13 @@ gpuservSortingFinalBuffer(gpuClient *gclient,
 					  cuStrError(rc));
 		goto bailout;
 	}
-	//TODO: Expand kds_final_dst if overflow
+	/* write back the row-index if separated */
+	if (m_rowindex != 0UL)
+	{
+		memcpy(KDS_GET_ROWINDEX(kds_final),
+			   (void *)m_rowindex,
+			   sizeof(uint64_t) * kds_final->nitems);
+	}
 	retval = true;
 bailout:
 	if (old_gcontext)
@@ -4967,12 +5014,12 @@ __setupGpuKernelFunctionsAndParams(gpuContext *gcontext)
 			 func_name, cuStrError(rc));
 	gcontext->cufn_merge_gpupreagg_buffer = cuda_function;
-	/* ------ kern_gpusort_finalize_buffer ------ */
-	func_name = "kern_gpusort_finalize_buffer";
+	/* ------ kern_gpusort_prep_buffer ------ */
+	func_name = "kern_gpusort_prep_buffer";
 	rc = cuModuleGetFunction(&cuda_function, cuda_module, func_name);
 	if (rc != CUDA_SUCCESS)
 		elog(ERROR, "failed on cuModuleGetFunction('%s'): %s",
 			 func_name, cuStrError(rc));
-	gcontext->cufn_gpusort_finalize_buffer = cuda_function;
+	gcontext->cufn_gpusort_prep_buffer = cuda_function;
 	/* ------ kern_gpusort_exec_bitonic ------ */
 	func_name = "kern_gpusort_exec_bitonic";
 	rc = cuModuleGetFunction(&cuda_function, cuda_module, func_name);
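The double loop launches the bitonic kernel once per (scale, step) pair, which
acts as a grid-wide barrier between passes, for a total of
(max_scale+1)*(max_scale+2)/2 launches. A tiny host sketch that prints the
schedule for a given row count (next_log2() here approximates get_next_log2()
as the smallest k with 2^k >= nitems):

    #include <stdio.h>

    static int next_log2(unsigned nitems)
    {
        int k = 0;
        while ((1U << k) < nitems)
            k++;
        return k;
    }

    int main(void)
    {
        unsigned nitems = 1000000;
        int max_scale = next_log2(nitems) - 1;   /* 19 for one million rows */
        int launches = 0;

        for (int scale = 0; scale <= max_scale; scale++)
            for (int step = scale; step >= 0; step--)
            {
                printf("launch: scale=%d step=%d\n", scale, step);
                launches++;
            }
        printf("total launches = %d\n", launches);   /* 20*21/2 = 210 */
        return 0;
    }
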
diff --git a/src/xpu_common.h b/src/xpu_common.h
index 06f4813e..92a3425d 100644
--- a/src/xpu_common.h
+++ b/src/xpu_common.h
@@ -2285,7 +2285,7 @@ typedef struct
 
 #define KSORT_KEY_ATTR__NULLS_FIRST		0x0400U
-#define KSORT_KEY_ATTR__DESC_ORDER		0x8000U
+#define KSORT_KEY_ATTR__ORDER_ASC		0x8000U
 #define KSORT_KEY_KIND__MASK			0x03ffU
 #define KSORT_KEY_KIND__SHIFT			16
 #define KSORT_KEY_KIND__VREF			0
@@ -2317,7 +2317,7 @@ typedef struct
 {
 	uint16_t	kind;			/* any of KSORT_KEY_KIND__* */
 	int8_t		nulls_first;	/* true, if NULLs first */
-	int8_t		desc_order;		/* true, if smaller is first */
+	int8_t		order_asc;		/* true, if ORDER ASC */
 	uint16_t	src_anum;		/* source attribute number of KDS */
 	uint16_t	buf_offset;		/* if not KSORT_KEY_KIND__VREF, it means offset of
 								 * the temporary calculated sorting key.
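
For reference, the masks above pack the key kind and its attribute flags into
one 32-bit value; a decoding sketch consistent with the extraction visible in
codegen_build_gpusort_keydesc() at the top of this patch (the exact packing on
the planner side is an assumption of this sketch):

    #include <stdint.h>

    #define KSORT_KEY_ATTR__NULLS_FIRST  0x0400U
    #define KSORT_KEY_ATTR__ORDER_ASC    0x8000U
    #define KSORT_KEY_KIND__MASK         0x03ffU
    #define KSORT_KEY_KIND__SHIFT        16

    static void decode_sortkey(uint32_t ival,
                               uint16_t *p_kind,
                               int8_t *p_nulls_first,
                               int8_t *p_order_asc)
    {
        /* kind lives in the upper half-word; flags in the lower one */
        *p_kind        = (ival >> KSORT_KEY_KIND__SHIFT) & KSORT_KEY_KIND__MASK;
        *p_nulls_first = ((ival & KSORT_KEY_ATTR__NULLS_FIRST) != 0);
        *p_order_asc   = ((ival & KSORT_KEY_ATTR__ORDER_ASC) != 0);
    }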