From: "Michael R. Crusoe" <crusoe@debian.org>
Date: Mon, 1 Jul 2024 12:50:58 +0200
Subject: Enable building on any 64-bit little-endian architecture
Forwarded: https://github.com/torognes/vsearch/pull/566

Uses the SIMD Everywhere (SIMDe) library to provide the x86 SIMD intrinsics
on architectures other than x86_64, ppc64le and aarch64.
---
 configure.ac      |  2 ++
 src/Makefile.am   | 25 +++++++++----------------
 src/align_simd.cc |  2 +-
 src/cpu.cc        | 17 +++++++++++++----
 src/vsearch.cc    |  2 +-
 src/vsearch.h     | 13 +++++++------
 6 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/configure.ac b/configure.ac
index 0e93bc7..ae1aa45 100644
--- a/configure.ac
+++ b/configure.ac
@@ -84,6 +84,7 @@ have_man_html=no
 case $target in
      aarch64*) target_aarch64="yes" ;;
      powerpc64*) target_ppc="yes" ;;
+     x86_64*) target_x86_64="yes" ;;
 esac
 
 AC_CHECK_HEADERS([windows.h], [AM_CONDITIONAL(TARGET_WIN, true)], [AM_CONDITIONAL(TARGET_WIN, false)])
@@ -92,6 +93,7 @@ AM_CONDITIONAL(HAVE_PS2PDF, test "x${have_ps2pdf}" = "xyes")
 AM_CONDITIONAL(HAVE_MAN_HTML, test "x${have_man_html}" = "xyes")
 AM_CONDITIONAL(TARGET_PPC, test "x${target_ppc}" = "xyes")
 AM_CONDITIONAL(TARGET_AARCH64, test "x${target_aarch64}" = "xyes")
+AM_CONDITIONAL(TARGET_X86_64, test "x${target_x86_64}" = "xyes")
 AM_PROG_CC_C_O
 
 AC_CONFIG_FILES([Makefile
diff --git a/src/Makefile.am b/src/Makefile.am
index 45ba8ad..6ed3f32 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -8,9 +8,11 @@ else
 if TARGET_AARCH64
 AM_CFLAGS += -march=armv8-a+simd -mtune=generic
 else
+if TARGET_X86_64
 AM_CFLAGS += -march=x86-64 -mtune=generic
 endif
 endif
+endif
 
 # Conditionally set NDEBUG based on ENABLE_DEBUG
 if ENABLE_DEBUG
@@ -86,20 +88,15 @@ util.h \
 vsearch.h \
 xstring.h
 
-if TARGET_PPC
-libcpu_a_SOURCES = cpu.cc $(VSEARCHHEADERS)
-noinst_LIBRARIES = libcpu.a libcityhash.a
-else
-if TARGET_AARCH64
-libcpu_a_SOURCES = cpu.cc $(VSEARCHHEADERS)
-noinst_LIBRARIES = libcpu.a libcityhash.a
-else
+if TARGET_X86_64
 libcpu_sse2_a_SOURCES = cpu.cc $(VSEARCHHEADERS)
 libcpu_sse2_a_CXXFLAGS = $(AM_CXXFLAGS) -msse2
 libcpu_ssse3_a_SOURCES = cpu.cc $(VSEARCHHEADERS)
 libcpu_ssse3_a_CXXFLAGS = $(AM_CXXFLAGS) -mssse3 -DSSSE3
 noinst_LIBRARIES = libcpu_sse2.a libcpu_ssse3.a libcityhash.a
-endif
+else
+libcpu_a_SOURCES = cpu.cc $(VSEARCHHEADERS)
+noinst_LIBRARIES = libcpu.a libcityhash.a
 endif
 
 libcityhash_a_SOURCES = city.cc city.h
@@ -114,14 +111,10 @@ else
 
 libcityhash_a_CXXFLAGS = $(AM_CXXFLAGS) -Wno-sign-compare
 
-if TARGET_PPC
-__top_builddir__bin_vsearch_LDADD = libcityhash.a libcpu.a
-else
-if TARGET_AARCH64
-__top_builddir__bin_vsearch_LDADD = libcityhash.a libcpu.a
-else
+if TARGET_X86_64
 __top_builddir__bin_vsearch_LDADD = libcityhash.a libcpu_ssse3.a libcpu_sse2.a
-endif
+else
+__top_builddir__bin_vsearch_LDADD = libcityhash.a libcpu.a
 endif
 
 endif
diff --git a/src/align_simd.cc b/src/align_simd.cc
index fd047bb..7cc8a0f 100644
--- a/src/align_simd.cc
+++ b/src/align_simd.cc
@@ -159,7 +159,7 @@ const uint16x8_t neon_mask =
 #define v_shift_left(a) vextq_s16((v_zero), (a), 7)
 #define v_mask_gt(a, b) vaddvq_u16(vandq_u16((vcgtq_s16((a), (b))), neon_mask))
 
-#elif __x86_64__
+#elif defined(__x86_64__) || defined(SIMDE_VERSION)
 
 typedef __m128i VECTOR_SHORT;
 
diff --git a/src/cpu.cc b/src/cpu.cc
index 8801879..21c4ae9 100644
--- a/src/cpu.cc
+++ b/src/cpu.cc
@@ -150,11 +150,20 @@ void increment_counters_from_bitmap(count_t * counters,
     }
 }
 
-#elif __x86_64__
+#elif __x86_64__ || defined(SIMDE_VERSION)
 
+#ifdef __x86_64__
 #include <emmintrin.h>
+#else
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse2.h>
+#endif
 
-#ifdef SSSE3
+#if defined(SIMDE_VERSION)
+void increment_counters_from_bitmap(count_t * counters,
+                                    unsigned char * bitmap,
+                                    unsigned int totalbits)
+#elif defined(SSSE3)
 void increment_counters_from_bitmap_ssse3(count_t * counters,
                                           unsigned char * bitmap,
                                           unsigned int totalbits)
@@ -189,7 +198,7 @@ void increment_counters_from_bitmap_sse2(count_t * counters,
   // 0xf7fbfdfe -> 1111'0111'1111'1011'1111'1101'1111'1110 (32 bits)
   static constexpr auto mask2 = static_cast<int32_t>(0xf7fbfdfe);
 
-#ifdef SSSE3
+#if defined(SSSE3) || defined(SIMDE_VERSION)
   const auto c1 = _mm_set_epi32(0x01010101, 0x01010101, 0x00000000, 0x00000000);
 #endif
   const auto c2 = _mm_set_epi32(mask1, mask2, mask1, mask2);
@@ -202,7 +211,7 @@ void increment_counters_from_bitmap_sse2(count_t * counters,
   for(auto j = 0U; j < r; j++)
     {
       const auto xmm0 = _mm_loadu_si128((__m128i *)p++);
-#ifdef SSSE3
+#if defined(SSSE3) || defined(SIMDE_VERSION)
       const auto xmm1 = _mm_shuffle_epi8(xmm0, c1);
 #else
       const auto xmm6 = _mm_unpacklo_epi8(xmm0, xmm0);
diff --git a/src/vsearch.cc b/src/vsearch.cc
index 39b7ef0..fd1c2f2 100644
--- a/src/vsearch.cc
+++ b/src/vsearch.cc
@@ -388,7 +388,7 @@ void cpu_features_detect()
         }
     }
 #else
-#error Unknown architecture
+    // Any other architecture is built with SIMDe: no CPU feature detection is done here
 #endif
 }
 
diff --git a/src/vsearch.h b/src/vsearch.h
index a0bd406..0cc9c12 100644
--- a/src/vsearch.h
+++ b/src/vsearch.h
@@ -106,13 +106,13 @@
 
 #ifdef __x86_64__
 
-#define PROG_CPU "x86_64"
+#define PROG_CPU x86_64
 #include <x86intrin.h>
 
 #elif __PPC__
 
 #ifdef __LITTLE_ENDIAN__
-#define PROG_CPU "ppc64le"
+#define PROG_CPU ppc64le
 #include <altivec.h>
 #undef bool
 #else
@@ -121,13 +121,14 @@
 
 #elif __aarch64__
 
-#define PROG_CPU "aarch64"
+#define PROG_CPU aarch64
 #include <arm_neon.h>
 
 #else
 
-#error Unknown architecture (not ppc64le, aarch64 or x86_64)
-
+#define PROG_CPU simde
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/avx512.h>
 #endif
 
 
@@ -190,7 +191,10 @@
 #endif
 
 
-#define PROG_ARCH PROG_OS "_" PROG_CPU
+// PROG_CPU is a bare token (e.g. x86_64); expand it first, then stringify it
+#define PROG_STRINGIFY_(x) #x
+#define PROG_STRINGIFY(x) PROG_STRINGIFY_(x)
+#define PROG_ARCH PROG_OS "_" PROG_STRINGIFY(PROG_CPU)
 
 #ifdef HAVE_DLFCN_H
 #include <dlfcn.h>
