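From: Michael R. Crusoe <crusoe@debian.org>
Subject: Add non-x86 portability using SIMD Everywhere

Include the SIMD Everywhere (SIMDe) headers, with SIMDE_ENABLE_NATIVE_ALIASES
defined, in place of <emmintrin.h>, <smmintrin.h> and <nmmintrin.h>, and drop
the hand-written SSE2 emulation of the SSE4.1 intrinsics in the ksw2 kernels.
Give the cpuid helper a non-asm fallback (cpuid[0] = 0) for builds where no
x86 asm variant is selected, and only create the separate SSE4.1 object
library and pass the -msse* flags when building for amd64 or i386.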
--- rapmap.orig/src/metro/metrohash128crc.cpp
+++ rapmap/src/metro/metrohash128crc.cpp
@@ -24,7 +24,8 @@
 //
 
 
-#include <nmmintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse4.2.h"
 #include <string.h>
 #include "metro/metrohash.h"
 #include "metro/platform.h"
--- rapmap.orig/src/ksw2pp/ksw2_extd2_sse.c
+++ rapmap/src/ksw2pp/ksw2_extd2_sse.c
@@ -3,29 +3,19 @@
 #include <assert.h>
 #include "ksw2pp/ksw2.h"
 
-#ifdef __SSE2__
-#include <emmintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse4.1.h"
 
-#ifdef KSW_SSE2_ONLY
-#undef __SSE4_1__
-#endif
-
-#ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
-
-#ifdef KSW_CPU_DISPATCH
 #ifdef __SSE4_1__
 void ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 				   int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
-#else
+#elif defined(__SSE2__)
 void ksw_extd2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 				   int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
-#endif
 #else
 void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 				   int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
-#endif // ~KSW_CPU_DISPATCH
+#endif
 {
 #define __dp_code_block1 \
 	z = _mm_load_si128(&s[t]); \
@@ -161,13 +151,8 @@
 				st = _mm_loadu_si128((__m128i*)&qrr[t]);
 				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
 				tmp = _mm_cmpeq_epi8(sq, st);
-#ifdef __SSE4_1__
 				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
 				tmp = _mm_blendv_epi8(tmp,     sc_N_,   mask);
-#else
-				tmp = _mm_or_si128(_mm_andnot_si128(tmp,  sc_mis_), _mm_and_si128(tmp,  sc_mch_));
-				tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp),     _mm_and_si128(mask, sc_N_));
-#endif
 				_mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp);
 			}
 		} else {
@@ -184,7 +169,6 @@
 			for (t = st_; t <= en_; ++t) {
 				__m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
 				__dp_code_block1;
-#ifdef __SSE4_1__
 				z = _mm_max_epi8(z, a);
 				z = _mm_max_epi8(z, b);
 				z = _mm_max_epi8(z, a2);
@@ -195,27 +179,6 @@
 				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_max_epi8(b,  zero_), qe_));
 				_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, zero_), qe2_));
 				_mm_store_si128(&y2[t], _mm_sub_epi8(_mm_max_epi8(b2, zero_), qe2_));
-#else
-				tmp = _mm_cmpgt_epi8(a,  z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
-				tmp = _mm_cmpgt_epi8(b,  z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
-				tmp = _mm_cmpgt_epi8(a2, z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2));
-				tmp = _mm_cmpgt_epi8(b2, z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2));
-				tmp = _mm_cmplt_epi8(sc_mch_, z);
-				z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
-				__dp_code_block2;
-				tmp = _mm_cmpgt_epi8(a, zero_);
-				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
-				tmp = _mm_cmpgt_epi8(b, zero_);
-				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_and_si128(tmp, b),  qe_));
-				tmp = _mm_cmpgt_epi8(a2, zero_);
-				_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_));
-				tmp = _mm_cmpgt_epi8(b2, zero_);
-				_mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_));
-#endif
 			}
 		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
 			__m128i *pr = p + (size_t)r * n_col_ - st_;
@@ -223,7 +186,6 @@
 			for (t = st_; t <= en_; ++t) {
 				__m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
 				__dp_code_block1;
-#ifdef __SSE4_1__
 				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1));       // d = a  > z? 1 : 0
 				z = _mm_max_epi8(z, a);
 				d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b,  z)); // d = b  > z? 2 : d
@@ -233,22 +195,6 @@
 				d = _mm_blendv_epi8(d, _mm_set1_epi8(4), _mm_cmpgt_epi8(b2, z)); // d = a2 > z? 3 : d
 				z = _mm_max_epi8(z, b2);
 				z = _mm_min_epi8(z, sc_mch_);
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				tmp = _mm_cmpgt_epi8(a,  z);
-				d = _mm_and_si128(tmp, _mm_set1_epi8(1));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
-				tmp = _mm_cmpgt_epi8(b,  z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2)));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
-				tmp = _mm_cmpgt_epi8(a2, z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3)));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2));
-				tmp = _mm_cmpgt_epi8(b2, z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(4)));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2));
-				tmp = _mm_cmplt_epi8(sc_mch_, z);
-				z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
-#endif
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(a, zero_);
 				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
@@ -270,7 +216,6 @@
 			for (t = st_; t <= en_; ++t) {
 				__m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
 				__dp_code_block1;
-#ifdef __SSE4_1__
 				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1));    // d = z > a?  0 : 1
 				z = _mm_max_epi8(z, a);
 				d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b));  // d = z > b?  d : 2
@@ -280,22 +225,6 @@
 				d = _mm_blendv_epi8(_mm_set1_epi8(4), d, _mm_cmpgt_epi8(z, b2)); // d = z > b2? d : 4
 				z = _mm_max_epi8(z, b2);
 				z = _mm_min_epi8(z, sc_mch_);
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				tmp = _mm_cmpgt_epi8(z, a);
-				d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a));
-				tmp = _mm_cmpgt_epi8(z, b);
-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2)));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b));
-				tmp = _mm_cmpgt_epi8(z, a2);
-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3)));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2));
-				tmp = _mm_cmpgt_epi8(z, b2);
-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(4)));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b2));
-				tmp = _mm_cmplt_epi8(sc_mch_, z);
-				z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
-#endif
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(zero_, a);
 				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_andnot_si128(tmp, a),  qe_));
@@ -330,13 +259,8 @@
 					_mm_storeu_si128((__m128i*)&H[t], H1);
 					t_ = _mm_set1_epi32(t);
 					tmp = _mm_cmpgt_epi32(H1, max_H_);
-#ifdef __SSE4_1__
 					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
 					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
-#else
-					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
-					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
-#endif
 				}
 				_mm_storeu_si128((__m128i*)HH, max_H_);
 				_mm_storeu_si128((__m128i*)tt, max_t_);
@@ -391,4 +315,3 @@
 		kfree(km, mem2); kfree(km, off);
 	}
 }
-#endif // __SSE2__
--- rapmap.orig/src/ksw2pp/ksw2_extf2_sse.c
+++ rapmap/src/ksw2pp/ksw2_extf2_sse.c
@@ -1,22 +1,16 @@
 #include <string.h>
 #include "ksw2pp/ksw2.h"
 
-#ifdef __SSE2__
-#include <emmintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse4.1.h"
 
 #ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
-
-#ifdef KSW_CPU_DISPATCH
-#ifdef __SSE4_1__
 void ksw_extf2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez)
-#else
-  void ksw_extf2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez)
-#endif
+#elif defined(__SSE2__)
+void ksw_extf2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez)
 #else
 void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez)
-#endif // ~KSW_CPU_DISPATCH
+#endif
 {
 	int32_t r, t, tlen_, qlen_, last_st, last_en, H0 = 0, last_H0_t = 0;
 	uint8_t *qr, *sf, *mem;
@@ -60,11 +54,7 @@
 			sq = _mm_loadu_si128((__m128i*)&sf[t]);
 			st = _mm_loadu_si128((__m128i*)&qrr[t]);
 			tmp = _mm_cmpeq_epi8(sq, st);
-#ifdef __SSE4_1__
 			tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
-#else
-			tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
-#endif
 			_mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp);
 		}
 		for (t = st_; t <= en_; ++t) {
@@ -75,12 +65,7 @@
 			vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); // vt1 <- v[r-1][t-1..t+14]
 			v1_ = tmp;
 			ut = _mm_load_si128(&u[t]);                      // ut <- u[t..t+15]
-#ifdef __SSE4_1__
 			z = _mm_max_epi8(z, vt1);                        // z = z > a? z : a (signed)
-#else
-			z = _mm_and_si128(z, _mm_cmpgt_epi8(z, _mm_setzero_si128()));  // z = z > 0? z : 0;
-			z = _mm_max_epu8(z, vt1);                        // z = max(z, a); this works because both are non-negative
-#endif
 			z = _mm_max_epu8(z, ut);                         // z = max(z, b); this works because both are non-negative
 			_mm_store_si128(&u[t], _mm_sub_epi8(z, vt1));    // u[r][t..t+15] <- z - v[r-1][t-1..t+14]
 			_mm_store_si128(&v[t], _mm_sub_epi8(z, ut));     // v[r][t..t+15] <- z - u[r-1][t..t+15]
@@ -104,4 +89,3 @@
 	else ez->zdropped = 1;
 	kfree(km, mem);
 }
-#endif // __SSE2__
--- rapmap.orig/src/ksw2pp/ksw2_exts2_sse.c
+++ rapmap/src/ksw2pp/ksw2_exts2_sse.c
@@ -3,29 +3,19 @@
 #include <assert.h>
 #include "ksw2pp/ksw2.h"
 
-#ifdef __SSE2__
-#include <emmintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse4.1.h"
 
-#ifdef KSW_SSE2_ONLY
-#undef __SSE4_1__
-#endif
-
-#ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
-
-#ifdef KSW_CPU_DISPATCH
 #ifdef __SSE4_1__
 void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 				   int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez)
-#else
+#elif defined(__SSE2__)
 void ksw_exts2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 				   int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez)
-#endif
 #else
 void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 				   int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez)
-#endif // ~KSW_CPU_DISPATCH
+#endif
 {
 #define __dp_code_block1 \
 	z = _mm_load_si128(&s[t]); \
@@ -161,13 +151,8 @@
 				st = _mm_loadu_si128((__m128i*)&qrr[t]);
 				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
 				tmp = _mm_cmpeq_epi8(sq, st);
-#ifdef __SSE4_1__
 				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
 				tmp = _mm_blendv_epi8(tmp,     sc_N_,   mask);
-#else
-				tmp = _mm_or_si128(_mm_andnot_si128(tmp,  sc_mis_), _mm_and_si128(tmp,  sc_mch_));
-				tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp),     _mm_and_si128(mask, sc_N_));
-#endif
 				_mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp);
 			}
 		} else {
@@ -184,7 +169,6 @@
 			for (t = st_; t <= en_; ++t) {
 				__m128i z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp;
 				__dp_code_block1;
-#ifdef __SSE4_1__
 				z = _mm_max_epi8(z, a);
 				z = _mm_max_epi8(z, b);
 				z = _mm_max_epi8(z, a2a);
@@ -193,23 +177,6 @@
 				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_max_epi8(b,  zero_), qe_));
 				tmp = _mm_load_si128(&donor[t]);
 				_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, tmp), q2_));
-#else
-				tmp = _mm_cmpgt_epi8(a,  z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
-				tmp = _mm_cmpgt_epi8(b,  z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
-				tmp = _mm_cmpgt_epi8(a2a, z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
-				__dp_code_block2;
-				tmp = _mm_cmpgt_epi8(a, zero_);
-				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
-				tmp = _mm_cmpgt_epi8(b, zero_);
-				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_and_si128(tmp, b),  qe_));
-				tmp = _mm_load_si128(&donor[t]); // TODO: check if this is correct
-				tmp = _mm_cmpgt_epi8(a2, tmp);
-				tmp = _mm_or_si128(_mm_andnot_si128(tmp, tmp), _mm_and_si128(tmp, a2));
-				_mm_store_si128(&x2[t], _mm_sub_epi8(tmp, q2_));
-#endif
 			}
 		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
 			__m128i *pr = p + r * n_col_ - st_;
@@ -217,24 +184,12 @@
 			for (t = st_; t <= en_; ++t) {
 				__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
 				__dp_code_block1;
-#ifdef __SSE4_1__
 				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1));       // d = a  > z? 1 : 0
 				z = _mm_max_epi8(z, a);
 				d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b,  z)); // d = b  > z? 2 : d
 				z = _mm_max_epi8(z, b);
 				d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2a, z)); // d = a2 > z? 3 : d
 				z = _mm_max_epi8(z, a2a);
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				tmp = _mm_cmpgt_epi8(a,  z);
-				d = _mm_and_si128(tmp, _mm_set1_epi8(1));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
-				tmp = _mm_cmpgt_epi8(b,  z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2)));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
-				tmp = _mm_cmpgt_epi8(a2a, z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3)));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
-#endif
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(a, zero_);
 				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
@@ -245,11 +200,7 @@
 
 				tmp2 = _mm_load_si128(&donor[t]);
 				tmp = _mm_cmpgt_epi8(a2, tmp2);
-#ifdef __SSE4_1__
 				tmp2 = _mm_max_epi8(a2, tmp2);
-#else
-				tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, tmp2), _mm_and_si128(tmp, a2));
-#endif
 				_mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
 				d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20)));
 				_mm_store_si128(&pr[t], d);
@@ -260,24 +211,12 @@
 			for (t = st_; t <= en_; ++t) {
 				__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
 				__dp_code_block1;
-#ifdef __SSE4_1__
 				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1));    // d = z > a?  0 : 1
 				z = _mm_max_epi8(z, a);
 				d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b));  // d = z > b?  d : 2
 				z = _mm_max_epi8(z, b);
 				d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2a)); // d = z > a2? d : 3
 				z = _mm_max_epi8(z, a2a);
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				tmp = _mm_cmpgt_epi8(z, a);
-				d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a));
-				tmp = _mm_cmpgt_epi8(z, b);
-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2)));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b));
-				tmp = _mm_cmpgt_epi8(z, a2a);
-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3)));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2a));
-#endif
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(zero_, a);
 				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_andnot_si128(tmp, a),  qe_));
@@ -288,11 +227,7 @@
 
 				tmp2 = _mm_load_si128(&donor[t]);
 				tmp = _mm_cmpgt_epi8(tmp2, a2);
-#ifdef __SSE4_1__
 				tmp2 = _mm_max_epi8(tmp2, a2);
-#else
-				tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, a2), _mm_and_si128(tmp, tmp2));
-#endif
 				_mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
 				d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0
 				_mm_store_si128(&pr[t], d);
@@ -316,13 +251,8 @@
 					_mm_storeu_si128((__m128i*)&H[t], H1);
 					t_ = _mm_set1_epi32(t);
 					tmp = _mm_cmpgt_epi32(H1, max_H_);
-#ifdef __SSE4_1__
 					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
 					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
-#else
-					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
-					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
-#endif
 				}
 				_mm_storeu_si128((__m128i*)HH, max_H_);
 				_mm_storeu_si128((__m128i*)tt, max_t_);
@@ -373,4 +303,3 @@
 		kfree(km, mem2); kfree(km, off);
 	}
 }
-#endif // __SSE2__
--- rapmap.orig/src/ksw2pp/ksw2_extz2_sse.c
+++ rapmap/src/ksw2pp/ksw2_extz2_sse.c
@@ -2,26 +2,16 @@
 #include <assert.h>
 #include "ksw2pp/ksw2.h"
 
-#ifdef __SSE2__
-#include <emmintrin.h>
-
-#ifdef KSW_SSE2_ONLY
-#undef __SSE4_1__
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse4.1.h"
 
 #ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
-
-#ifdef KSW_CPU_DISPATCH
-#ifdef __SSE4_1__
 void ksw_extz2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
-#else
+#elif defined(__SSE2__)
 void ksw_extz2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
-#endif
 #else
 void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
-#endif // ~KSW_CPU_DISPATCH
+#endif
 {
 #define __dp_code_block1 \
 	z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); \
@@ -129,13 +119,8 @@
 				st = _mm_loadu_si128((__m128i*)&qrr[t]);
 				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
 				tmp = _mm_cmpeq_epi8(sq, st);
-#ifdef __SSE4_1__
 				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
 				tmp = _mm_blendv_epi8(tmp,     sc_N_,   mask);
-#else
-				tmp = _mm_or_si128(_mm_andnot_si128(tmp,  sc_mis_), _mm_and_si128(tmp,  sc_mch_));
-				tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp),     _mm_and_si128(mask, sc_N_));
-#endif
 				_mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp);
 			}
 		} else {
@@ -151,22 +136,10 @@
 			for (t = st_; t <= en_; ++t) {
 				__m128i z, a, b, xt1, vt1, ut, tmp;
 				__dp_code_block1;
-#ifdef __SSE4_1__
 				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8()
-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-#endif
 				__dp_code_block2;
-#ifdef __SSE4_1__
 				_mm_store_si128(&x[t], _mm_max_epi8(a, zero_));
 				_mm_store_si128(&y[t], _mm_max_epi8(b, zero_));
-#else
-				tmp = _mm_cmpgt_epi8(a, zero_);
-				_mm_store_si128(&x[t], _mm_and_si128(a, tmp));
-				tmp = _mm_cmpgt_epi8(b, zero_);
-				_mm_store_si128(&y[t], _mm_and_si128(b, tmp));
-#endif
 			}
 		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
 			__m128i *pr = p + (size_t)r * n_col_ - st_;
@@ -175,16 +148,9 @@
 				__m128i d, z, a, b, xt1, vt1, ut, tmp;
 				__dp_code_block1;
 				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
-#ifdef __SSE4_1__
 				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
 				tmp = _mm_cmpgt_epi8(b, z);
 				d = _mm_blendv_epi8(d, flag2_, tmp);             // d = b > z? 2 : d
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-				tmp = _mm_cmpgt_epi8(b, z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv
-#endif
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(a, zero_);
 				_mm_store_si128(&x[t], _mm_and_si128(tmp, a));
@@ -201,16 +167,9 @@
 				__m128i d, z, a, b, xt1, vt1, ut, tmp;
 				__dp_code_block1;
 				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1
-#ifdef __SSE4_1__
 				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
 				tmp = _mm_cmpgt_epi8(z, b);
 				d = _mm_blendv_epi8(flag2_, d, tmp);             // d = z > b? d : 2
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-				tmp = _mm_cmpgt_epi8(z, b);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv
-#endif
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(zero_, a);
 				_mm_store_si128(&x[t], _mm_andnot_si128(tmp, a));
@@ -241,13 +200,8 @@
 					_mm_storeu_si128((__m128i*)&H[t], H1);
 					t_ = _mm_set1_epi32(t);
 					tmp = _mm_cmpgt_epi32(H1, max_H_);
-#ifdef __SSE4_1__
 					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
 					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
-#else
-					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
-					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
-#endif
 				}
 				_mm_storeu_si128((__m128i*)HH, max_H_);
 				_mm_storeu_si128((__m128i*)tt, max_t_);
@@ -302,4 +256,3 @@
 		kfree(km, mem2); kfree(km, off);
 	}
 }
-#endif // __SSE2__
--- rapmap.orig/src/ksw2pp/ksw2_gg2_sse.c
+++ rapmap/src/ksw2pp/ksw2_gg2_sse.c
@@ -1,12 +1,8 @@
 #include <stdio.h> // for debugging only
 #include "ksw2pp/ksw2.h"
 
-#ifdef __SSE2__
-#include <emmintrin.h>
-
-#ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse4.1.h"
 
 int ksw_gg2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
 {
@@ -86,16 +82,9 @@
 			b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); // b <- y[r-1][t..t+15] + u[r-1][t..t+15]
 
 			d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
-#ifdef __SSE4_1__
 			z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
 			tmp = _mm_cmpgt_epi8(b, z);
 			d = _mm_blendv_epi8(d, flag2_, tmp);             // d = b > z? 2 : d
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-			z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-			z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-			tmp = _mm_cmpgt_epi8(b, z);
-			d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv
-#endif
 			z = _mm_max_epu8(z, b);                          // z = max(z, b); this works because both are non-negative
 			_mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); // u[r][t..t+15] <- z - v[r-1][t-1..t+14]
 			_mm_store_si128(&v[t], _mm_sub_epi8(z, ut));  // v[r][t..t+15] <- z - u[r-1][t..t+15]
@@ -124,4 +113,3 @@
 	kfree(km, mem2); kfree(km, off);
 	return H0;
 }
-#endif // __SSE2__
--- rapmap.orig/src/ksw2pp/KSW2Aligner.cpp
+++ rapmap/src/ksw2pp/KSW2Aligner.cpp
@@ -27,10 +27,12 @@
 	asm volatile ("cpuid"
 			: "=a" (cpuid[0]), "=b" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3])
 			: "0" (func_id), "2" (subfunc_id));
-#else // on 32bit, ebx can NOT be used as PIC code
+#elif defined(__i386__) // on 32bit, ebx can NOT be used as PIC code
 	asm volatile ("xchgl %%ebx, %1; cpuid; xchgl %%ebx, %1"
 			: "=a" (cpuid[0]), "=r" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3])
 			: "0" (func_id), "2" (subfunc_id));
+#else
+	cpuid[0] = 0;
 #endif
 }
 #endif
--- rapmap.orig/src/CMakeLists.txt
+++ rapmap/src/CMakeLists.txt
@@ -87,20 +87,40 @@
 check_ipo_supported(RESULT HAS_IPOHAS_IPO)
 
 add_library(ksw2pp_sse2 OBJECT ${KSW2PP_ADVANCED_LIB_SRCS})
-add_library(ksw2pp_sse4 OBJECT ${KSW2PP_ADVANCED_LIB_SRCS})
 add_library(ksw2pp_basic OBJECT ${KSW2PP_BASIC_LIB_SRCS})
+set_target_properties(ksw2pp_basic PROPERTIES INCLUDE_DIRECTORIES ${GAT_SOURCE_DIR}/include)
 
-set_target_properties(ksw2pp_sse2 PROPERTIES COMPILE_FLAGS "-O3 -msse2 -mno-sse4.1")
-set_target_properties(ksw2pp_sse2 PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;KSW_SSE2_ONLY;HAVE_KALLOC")
-set_target_properties(ksw2pp_sse4 PROPERTIES COMPILE_FLAGS "-O3 -msse4.1")
-set_target_properties(ksw2pp_sse4 PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC")
-set_target_properties(ksw2pp_basic PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC")
+# Fall back to `uname -m` when CMake has not defined the target processor.
+if(NOT DEFINED CMAKE_SYSTEM_PROCESSOR)
+  execute_process(COMMAND uname -m OUTPUT_VARIABLE ARCHITECTURE OUTPUT_STRIP_TRAILING_WHITESPACE)
+  set(CMAKE_SYSTEM_PROCESSOR "${ARCHITECTURE}")
+endif()
 
-set_target_properties(ksw2pp_basic PROPERTIES INCLUDE_DIRECTORIES ${GAT_SOURCE_DIR}/include)
-set_target_properties(ksw2pp_sse4 PROPERTIES INCLUDE_DIRECTORIES ${GAT_SOURCE_DIR}/include)
+# Map the x86_64 / i686 spellings onto amd64 / i386 so one comparison below covers both.
+if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
+  set(CMAKE_SYSTEM_PROCESSOR "amd64")
+elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i686")
+  set(CMAKE_SYSTEM_PROCESSOR "i386")
+endif()
+
+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+
+if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "amd64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i386")
+  # x86: compile the advanced sources twice (SSE2-only and SSE4.1) with KSW_CPU_DISPATCH defined.
+  add_library(ksw2pp_sse4 OBJECT ${KSW2PP_ADVANCED_LIB_SRCS})
+  set_target_properties(ksw2pp_sse2 PROPERTIES COMPILE_FLAGS "-O3 -msse2 -mno-sse4.1")
+  set_target_properties(ksw2pp_sse2 PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;KSW_SSE2_ONLY;HAVE_KALLOC")
+  set_target_properties(ksw2pp_sse4 PROPERTIES COMPILE_FLAGS "-O3 -msse4.1")
+  set_target_properties(ksw2pp_sse4 PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC")
+  set_target_properties(ksw2pp_basic PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC")
+  set_target_properties(ksw2pp_sse4 PROPERTIES INCLUDE_DIRECTORIES ${GAT_SOURCE_DIR}/include)
+  add_library(ksw2pp STATIC $<TARGET_OBJECTS:ksw2pp_sse2> $<TARGET_OBJECTS:ksw2pp_sse4> $<TARGET_OBJECTS:ksw2pp_basic>)
+else()
+  # Any other processor: no -msse* flags and no ksw2pp_sse4; the advanced sources are compiled once.
+  add_library(ksw2pp STATIC $<TARGET_OBJECTS:ksw2pp_sse2> $<TARGET_OBJECTS:ksw2pp_basic>)
+endif()
 
 # Build the ksw2pp library
-add_library(ksw2pp STATIC $<TARGET_OBJECTS:ksw2pp_sse2> $<TARGET_OBJECTS:ksw2pp_sse4> $<TARGET_OBJECTS:ksw2pp_basic>)
 set_target_properties(ksw2pp PROPERTIES COMPILE_DEFINITIONS "KSW_CPU_DISPATCH;HAVE_KALLOC")
 if(HAS_IPO)
   set_property(TARGET ksw2pp PROPERTY INTERPROCEDURAL_OPTIMIZATION True)
