From d3bc4e765862070f2560bd396e8366298b16ae12 Mon Sep 17 00:00:00 2001 From: Hans Petter Jansson Date: Sun, 31 Mar 2024 15:19:40 +0200 Subject: [PATCH] libchafa: Add an AVX2-optimized path for symbol error calculation This gives an overall speedup of 10-20% over SSE 4.1 with -w 9. --- chafa/chafa-canvas.c | 5 ++++ chafa/chafa-symbol-map.c | 54 ++++++++++++++++++++++++++------- chafa/internal/chafa-avx2.c | 55 ++++++++++++++++++++++++++++++++++ chafa/internal/chafa-private.h | 6 ++++ 4 files changed, 109 insertions(+), 11 deletions(-) diff --git a/chafa/chafa-canvas.c b/chafa/chafa-canvas.c index 530e3e4d..d9854596 100644 --- a/chafa/chafa-canvas.c +++ b/chafa/chafa-canvas.c @@ -241,6 +241,11 @@ eval_symbol_error (const ChafaWorkCell *wcell, pair = eval->colors; } +#ifdef HAVE_AVX2_INTRINSICS + if (chafa_have_avx2 ()) + error = calc_error_avx2 (wcell->pixels, &pair, sym->mask_u32); + else +#endif #ifdef HAVE_SSE41_INTRINSICS if (chafa_have_sse41 ()) error = calc_error_sse41 (wcell->pixels, &pair, covp); diff --git a/chafa/chafa-symbol-map.c b/chafa/chafa-symbol-map.c index 1b13222b..433b5a80 100644 --- a/chafa/chafa-symbol-map.c +++ b/chafa/chafa-symbol-map.c @@ -266,6 +266,31 @@ bitmap_to_argb (guint64 bitmap, guint8 *argb, gint rowstride) } } +static gpointer +bitmap_to_argb_alloc (guint64 bitmap) +{ + gpointer argb; + + argb = g_malloc (CHAFA_SYMBOL_N_PIXELS * 4); + bitmap_to_argb (bitmap, argb, CHAFA_SYMBOL_WIDTH_PIXELS * 4); + return argb; +} + +static gpointer +bitmap2_to_argb_alloc (guint64 bitmap_0, guint64 bitmap_1) +{ + gpointer argb; + + argb = g_malloc (CHAFA_SYMBOL_N_PIXELS * 4 * 2); + + bitmap_to_argb (bitmap_0, argb, + CHAFA_SYMBOL_WIDTH_PIXELS * 4 * 2); + bitmap_to_argb (bitmap_1, ((guint8 *) argb) + CHAFA_SYMBOL_WIDTH_PIXELS * 4, + CHAFA_SYMBOL_WIDTH_PIXELS * 4 * 2); + + return argb; +} + static guint64 glyph_to_bitmap (gint width, gint height, gint rowstride, @@ -363,7 +388,10 @@ compile_symbols (ChafaSymbolMap *symbol_map, GHashTable *desired_symbols) gint i; for (i = 0; i < symbol_map->n_symbols; i++) + { g_free (symbol_map->symbols [i].coverage); + g_free (symbol_map->symbols [i].mask_u32); + } g_free (symbol_map->symbols); g_free (symbol_map->packed_bitmaps); @@ -380,6 +408,7 @@ compile_symbols (ChafaSymbolMap *symbol_map, GHashTable *desired_symbols) symbol_map->symbols [i] = *sym; symbol_map->symbols [i].coverage = g_memdup (symbol_map->symbols [i].coverage, CHAFA_SYMBOL_N_PIXELS); + symbol_map->symbols [i].mask_u32 = bitmap_to_argb_alloc (symbol_map->symbols [i].bitmap); i++; } @@ -404,7 +433,9 @@ compile_symbols_wide (ChafaSymbolMap *symbol_map, GHashTable *desired_symbols) for (i = 0; i < symbol_map->n_symbols2; i++) { g_free (symbol_map->symbols2 [i].sym [0].coverage); + g_free (symbol_map->symbols2 [i].sym [0].mask_u32); g_free (symbol_map->symbols2 [i].sym [1].coverage); + g_free (symbol_map->symbols2 [i].sym [1].mask_u32); } g_free (symbol_map->symbols2); @@ -421,8 +452,10 @@ compile_symbols_wide (ChafaSymbolMap *symbol_map, GHashTable *desired_symbols) symbol_map->symbols2 [i] = *sym; symbol_map->symbols2 [i].sym [0].coverage = g_memdup (symbol_map->symbols2 [i].sym [0].coverage, CHAFA_SYMBOL_N_PIXELS); + symbol_map->symbols2 [i].sym [0].mask_u32 = bitmap_to_argb_alloc (symbol_map->symbols2 [i].sym [0].bitmap); symbol_map->symbols2 [i].sym [1].coverage = g_memdup (symbol_map->symbols2 [i].sym [1].coverage, CHAFA_SYMBOL_N_PIXELS); + symbol_map->symbols2 [i].sym [1].mask_u32 = bitmap_to_argb_alloc (symbol_map->symbols2 [i].sym [1].bitmap); i++; } @@ -507,6 +540,7 @@ free_symbol (gpointer sym_p) ChafaSymbol *sym = sym_p; g_free (sym->coverage); + g_free (sym->mask_u32); g_free (sym); } @@ -516,7 +550,9 @@ free_symbol_wide (gpointer sym_p) ChafaSymbol2 *sym = sym_p; g_free (sym->sym [0].coverage); + g_free (sym->sym [0].mask_u32); g_free (sym->sym [1].coverage); + g_free (sym->sym [1].mask_u32); g_free (sym); } @@ -1028,12 +1064,17 @@ chafa_symbol_map_deinit (ChafaSymbolMap *symbol_map) g_return_if_fail (symbol_map != NULL); for (i = 0; i < symbol_map->n_symbols; i++) + { g_free (symbol_map->symbols [i].coverage); + g_free (symbol_map->symbols [i].mask_u32); + } for (i = 0; i < symbol_map->n_symbols2; i++) { g_free (symbol_map->symbols2 [i].sym [0].coverage); + g_free (symbol_map->symbols2 [i].sym [0].mask_u32); g_free (symbol_map->symbols2 [i].sym [1].coverage); + g_free (symbol_map->symbols2 [i].sym [1].mask_u32); } g_hash_table_destroy (symbol_map->glyphs); @@ -1829,13 +1870,7 @@ chafa_symbol_map_get_glyph (ChafaSymbolMap *symbol_map, g_assert (glyph2->c == code_point); if (pixels_out) - { - *pixels_out = g_malloc (CHAFA_SYMBOL_N_PIXELS * 4 * 2); - bitmap_to_argb (glyph2->bitmap [0], *pixels_out, - CHAFA_SYMBOL_WIDTH_PIXELS * 4 * 2); - bitmap_to_argb (glyph2->bitmap [1], ((guint8 *) *pixels_out) + CHAFA_SYMBOL_WIDTH_PIXELS * 4, - CHAFA_SYMBOL_WIDTH_PIXELS * 4 * 2); - } + *pixels_out = bitmap2_to_argb_alloc (glyph2->bitmap [0], glyph2->bitmap [1]); width = CHAFA_SYMBOL_WIDTH_PIXELS * 2; height = CHAFA_SYMBOL_HEIGHT_PIXELS; @@ -1852,10 +1887,7 @@ chafa_symbol_map_get_glyph (ChafaSymbolMap *symbol_map, g_assert (glyph->c == code_point); if (pixels_out) - { - *pixels_out = g_malloc (CHAFA_SYMBOL_N_PIXELS * 4); - bitmap_to_argb (glyph->bitmap, *pixels_out, CHAFA_SYMBOL_WIDTH_PIXELS * 4); - } + *pixels_out = bitmap_to_argb_alloc (glyph->bitmap); width = CHAFA_SYMBOL_WIDTH_PIXELS; height = CHAFA_SYMBOL_HEIGHT_PIXELS; diff --git a/chafa/internal/chafa-avx2.c b/chafa/internal/chafa-avx2.c index 359ba1d3..8d659105 100644 --- a/chafa/internal/chafa-avx2.c +++ b/chafa/internal/chafa-avx2.c @@ -19,7 +19,62 @@ #include "config.h" +#include #include #include "chafa.h" #include "internal/chafa-private.h" +gint +calc_error_avx2 (const ChafaPixel *pixels, const ChafaColorPair *color_pair, + const guint32 *sym_mask_u32) +{ + __m256i err_8x_u32 = { 0 }; + const gint32 *e = (gint32 *) &err_8x_u32; + __m128i fg_4x_u32, bg_4x_u32; + __m256i fg_4x_u64, bg_4x_u64; + const __m256i *pixels_8x_p = (const __m256i *) pixels; + const __m256i *sym_mask_8x_p = (const __m256i *) sym_mask_u32; + gint i; + + fg_4x_u32 = _mm_set1_epi32 (CHAFA_COLOR8_U32 (color_pair->colors [CHAFA_COLOR_PAIR_FG])); + fg_4x_u64 = _mm256_cvtepu8_epi16 (fg_4x_u32); + + bg_4x_u32 = _mm_set1_epi32 (CHAFA_COLOR8_U32 (color_pair->colors [CHAFA_COLOR_PAIR_BG])); + bg_4x_u64 = _mm256_cvtepu8_epi16 (bg_4x_u32); + + for (i = 0; i < CHAFA_SYMBOL_N_PIXELS / 8; i++) + { + __m256i pixels_8x, sym_mask_8x; + __m256i p0, m0, fg0, bg0, d0; + __m256i p1, m1, fg1, bg1, d1; + + pixels_8x = _mm256_loadu_si256 (pixels_8x_p); + pixels_8x_p++; + + sym_mask_8x = _mm256_loadu_si256 (sym_mask_8x_p); + sym_mask_8x_p++; + + p0 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (pixels_8x, 0)); + m0 = _mm256_cvtepi8_epi16 (_mm256_extracti128_si256 (sym_mask_8x, 0)); + fg0 = _mm256_and_si256 (m0, _mm256_sub_epi16 (fg_4x_u64, p0)); + bg0 = _mm256_andnot_si256 (m0, _mm256_sub_epi16 (bg_4x_u64, p0)); + d0 = _mm256_or_si256 (fg0, bg0); + d0 = _mm256_mullo_epi16 (d0, d0); + d0 = _mm256_add_epi32 (_mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d0, 0)), + _mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d0, 1))); + + p1 = _mm256_cvtepu8_epi16 (_mm256_extracti128_si256 (pixels_8x, 1)); + m1 = _mm256_cvtepi8_epi16 (_mm256_extracti128_si256 (sym_mask_8x, 1)); + fg1 = _mm256_and_si256 (m1, _mm256_sub_epi16 (fg_4x_u64, p1)); + bg1 = _mm256_andnot_si256 (m1, _mm256_sub_epi16 (bg_4x_u64, p1)); + d1 = _mm256_or_si256 (fg1, bg1); + d1 = _mm256_mullo_epi16 (d1, d1); + d1 = _mm256_add_epi32 (_mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d1, 0)), + _mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (d1, 1))); + + err_8x_u32 = _mm256_add_epi32 (err_8x_u32, d0); + err_8x_u32 = _mm256_add_epi32 (err_8x_u32, d1); + } + + return e [0] + e [1] + e [2] + e [4] + e [5] + e [6]; +} diff --git a/chafa/internal/chafa-private.h b/chafa/internal/chafa-private.h index 77c94794..a5c26527 100644 --- a/chafa/internal/chafa-private.h +++ b/chafa/internal/chafa-private.h @@ -42,6 +42,7 @@ typedef struct ChafaSymbolTags sc; gunichar c; gchar *coverage; + guint32 *mask_u32; gint fg_weight, bg_weight; guint64 bitmap; gint popcount; @@ -213,6 +214,11 @@ void chafa_leave_mmx (void); gint calc_error_sse41 (const ChafaPixel *pixels, const ChafaColorPair *color_pair, const guint8 *cov) G_GNUC_PURE; #endif +#ifdef HAVE_AVX2_INTRINSICS +gint calc_error_avx2 (const ChafaPixel *pixels, const ChafaColorPair *color_pair, + const guint32 *sym_mask_u32) G_GNUC_PURE; +#endif + #if defined(HAVE_POPCNT64_INTRINSICS) || defined(HAVE_POPCNT32_INTRINSICS) #define HAVE_POPCNT_INTRINSICS #endif