core/conv: add x86 SSE support for Viterbi decoder

Fast convolutional decoding is provided through x86 intrinsic based SSE operations. SSE3, found on virtually all modern x86 processors, is the minimal requirement. SSE4.1 and AVX2 are used if available. Also, the original code was extended with runtime SIMD detection, so only supported extensions will be used by target CPU. It makes the library more partable, what is very important for binary packages distribution. Runtime SIMD detection is currently implemented through the __builtin_cpu_supports call. Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351
author: Tom Tsou <tom.tsou@ettus.com> 2017-04-29 00:16:43 +0700
committer: Harald Welte <laforge@gnumonks.org> 2017-05-24 22:04:53 +0000
commit: 34e228a9bcf3ac37287bb5e684ace46818740f3b (patch)
tree: ebb7bcd4dd9c494106096384327db0122a4fde01
parent: b6c8dda5e34df6b74183ad24cf66c98601065e56 (diff)
4 files changed, 748 insertions, 10 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index 57240550..a0aa5a0c 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -23,6 +23,12 @@ libosmocore_la_SOURCES = timer.c timer_gettimeofday.c select.c signal.c msgb.c b
 			 macaddr.c stat_item.c stats.c stats_statsd.c prim.c \
 			 viterbi.c viterbi_gen.c sercomm.c
 
+if HAVE_SSE3
+libosmocore_la_SOURCES += viterbi_sse.c
+# Per-object flags hack
+viterbi_sse.lo : CFLAGS += $(SIMD_FLAGS)
+endif
+
 BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
 
 if ENABLE_PLUGIN
diff --git a/src/viterbi.c b/src/viterbi.c
index 21c6a578..2097a02d 100644
--- a/src/viterbi.c
+++ b/src/viterbi.c
@@ -24,12 +24,35 @@
 #include <string.h>
 #include <errno.h>
 
-#include <osmocom/core/conv.h>
 #include "config.h"
 
+#include <osmocom/core/conv.h>
+
 #define BIT2NRZ(REG,N)	(((REG >> N) & 0x01) * 2 - 1) * -1
 #define NUM_STATES(K)	(K == 7 ? 64 : 16)
 
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * This pointers will be initialized by the osmo_conv_init()
+ * depending on supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
+static void (*vdec_free)(int16_t *ptr);
+
+/* Forward malloc wrappers */
+int16_t *osmo_conv_vdec_malloc(size_t n);
+void osmo_conv_vdec_free(int16_t *ptr);
+
+#ifdef HAVE_SSE3
+int16_t *osmo_conv_vdec_malloc_sse3(size_t n);
+void osmo_conv_vdec_free_sse3(int16_t *ptr);
+#endif
+
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm);
@@ -44,6 +67,21 @@ void osmo_conv_gen_metrics_k7_n3(const int8_t *seq, const int16_t *out,
 void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm);
 
+#ifdef HAVE_SSE3
+void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+#endif
+
 /* Trellis State
  * state - Internal lshift register value
  * prev  - Register values of previous 0 and 1 states
@@ -90,12 +128,6 @@ struct vdecoder {
 		int16_t *, int16_t *, int);
 };
 
-/* Non-aligned Memory Allocator */
-static int16_t *vdec_malloc(size_t n)
-{
-	return (int16_t *) malloc(sizeof(int16_t) * n);
-}
-
 /* Accessor calls */
 static inline int conv_code_recursive(const struct osmo_conv_code *code)
 {
@@ -294,9 +326,9 @@ static void free_trellis(struct vtrellis *trellis)
 	if (!trellis)
 		return;
 
+	vdec_free(trellis->outputs);
+	vdec_free(trellis->sums);
 	free(trellis->vals);
-	free(trellis->outputs);
-	free(trellis->sums);
 	free(trellis);
 }
 
@@ -430,7 +462,7 @@ static void free_vdec(struct vdecoder *dec)
 	if (!dec)
 		return;
 
-	free(dec->paths[0]);
+	vdec_free(dec->paths[0]);
 	free(dec->paths);
 	free_trellis(dec->trellis);
 	free(dec);
@@ -456,13 +488,31 @@ static struct vdecoder *alloc_vdec(const struct osmo_conv_code *code)
 	if (dec->k == 5) {
 		switch (dec->n) {
 		case 2:
+		#ifdef HAVE_SSE3
+			dec->metric_func = !sse3_supported ?
+				osmo_conv_gen_metrics_k5_n2 :
+				osmo_conv_gen_metrics_k5_n2_sse;
+		#else
 			dec->metric_func = osmo_conv_gen_metrics_k5_n2;
+		#endif
 			break;
 		case 3:
+		#ifdef HAVE_SSE3
+			dec->metric_func = !sse3_supported ?
+				osmo_conv_gen_metrics_k5_n3 :
+				osmo_conv_gen_metrics_k5_n3_sse;
+		#else
 			dec->metric_func = osmo_conv_gen_metrics_k5_n3;
+		#endif
 			break;
 		case 4:
+		#ifdef HAVE_SSE3
+			dec->metric_func = !sse3_supported ?
+				osmo_conv_gen_metrics_k5_n4 :
+				osmo_conv_gen_metrics_k5_n4_sse;
+		#else
 			dec->metric_func = osmo_conv_gen_metrics_k5_n4;
+		#endif
 			break;
 		default:
 			goto fail;
@@ -470,13 +520,31 @@ static struct vdecoder *alloc_vdec(const struct osmo_conv_code *code)
 	} else if (dec->k == 7) {
 		switch (dec->n) {
 		case 2:
+		#ifdef HAVE_SSE3
+			dec->metric_func = !sse3_supported ?
+				osmo_conv_gen_metrics_k7_n2 :
+				osmo_conv_gen_metrics_k7_n2_sse;
+		#else
 			dec->metric_func = osmo_conv_gen_metrics_k7_n2;
+		#endif
 			break;
 		case 3:
+		#ifdef HAVE_SSE3
+			dec->metric_func = !sse3_supported ?
+				osmo_conv_gen_metrics_k7_n3 :
+				osmo_conv_gen_metrics_k7_n3_sse;
+		#else
 			dec->metric_func = osmo_conv_gen_metrics_k7_n3;
+		#endif
 			break;
 		case 4:
+		#ifdef HAVE_SSE3
+			dec->metric_func = !sse3_supported ?
+				osmo_conv_gen_metrics_k7_n4 :
+				osmo_conv_gen_metrics_k7_n4_sse;
+		#else
 			dec->metric_func = osmo_conv_gen_metrics_k7_n4;
+		#endif
 			break;
 		default:
 			goto fail;
@@ -569,6 +637,36 @@ static int conv_decode(struct vdecoder *dec, const int8_t *seq,
 	return traceback(dec, out, term, len);
 }
 
+static void osmo_conv_init(void)
+{
+	init_complete = 1;
+
+#ifdef HAVE___BUILTIN_CPU_SUPPORTS
+	/* Detect CPU capabilities */
+	#ifdef HAVE_AVX2
+		avx2_supported = __builtin_cpu_supports("avx2");
+	#endif
+
+	#ifdef HAVE_SSE3
+		sse3_supported = __builtin_cpu_supports("sse3");
+	#endif
+
+	#ifdef HAVE_SSE4_1
+		sse41_supported = __builtin_cpu_supports("sse4.1");
+	#endif
+#endif
+
+#ifdef HAVE_SSE3
+	vdec_malloc = !sse3_supported ?
+		&osmo_conv_vdec_malloc : &osmo_conv_vdec_malloc_sse3;
+	vdec_free = !sse3_supported ?
+		&osmo_conv_vdec_free : &osmo_conv_vdec_free_sse3;
+#else
+	vdec_malloc = &osmo_conv_vdec_malloc;
+	vdec_free = &osmo_conv_vdec_free;
+#endif
+}
+
 /* All-in-one Viterbi decoding  */
 int osmo_conv_decode_acc(const struct osmo_conv_code *code,
 	const sbit_t *input, ubit_t *output)
@@ -576,6 +674,9 @@ int osmo_conv_decode_acc(const struct osmo_conv_code *code,
 	int rc;
 	struct vdecoder *vdec;
 
+	if (!init_complete)
+		osmo_conv_init();
+
 	if ((code->N < 2) || (code->N > 4) || (code->len < 1) ||
 		((code->K != 5) && (code->K != 7)))
 		return -EINVAL;
diff --git a/src/viterbi_gen.c b/src/viterbi_gen.c
index 7972c396..2ced6152 100644
--- a/src/viterbi_gen.c
+++ b/src/viterbi_gen.c
@@ -20,6 +20,7 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 
@@ -126,6 +127,19 @@ static void gen_path_metrics(int num_states, int16_t *sums,
 	memcpy(sums, new_sums, num_states * sizeof(int16_t));
 }
 
+/* Not-aligned Memory Allocator */
+__attribute__ ((visibility("hidden")))
+int16_t *osmo_conv_vdec_malloc(size_t n)
+{
+	return (int16_t *) malloc(sizeof(int16_t) * n);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_vdec_free(int16_t *ptr)
+{
+	free(ptr);
+}
+
 /* 16-state branch-path metrics units (K=5) */
 __attribute__ ((visibility("hidden")))
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
diff --git a/src/viterbi_sse.c b/src/viterbi_sse.c
new file mode 100644
index 00000000..5ca21b09
--- /dev/null
+++ b/src/viterbi_sse.c
@@ -0,0 +1,617 @@
+/*
+ * Intel SSE Viterbi decoder
+ *
+ * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
+ *
+ * All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+
+#include "config.h"
+
+#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
+	#include <smmintrin.h>
+#endif
+
+#ifdef HAVE_AVX2
+	#include <immintrin.h>
+#endif
+
+#define SSE_ALIGN 16
+
+extern int sse41_supported;
+extern int sse3_supported;
+extern int avx2_supported;
+
+/* Octo-Viterbi butterfly
+ * Compute 8-wide butterfly generating 16 path decisions and 16 accumulated
+ * sums. Inputs all packed 16-bit integers in three 128-bit XMM registers.
+ * Two intermediate registers are used and results are set in the upper 4
+ * registers.
+ *
+ * Input:
+ * M0 - Path metrics 0 (packed 16-bit integers)
+ * M1 - Path metrics 1 (packed 16-bit integers)
+ * M2 - Branch metrics (packed 16-bit integers)
+ *
+ * Output:
+ * M2 - Selected and accumulated path metrics 0
+ * M4 - Selected and accumulated path metrics 1
+ * M3 - Path selections 0
+ * M1 - Path selections 1
+ */
+#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
+{ \
+	M3 = _mm_adds_epi16(M0, M2); \
+	M4 = _mm_subs_epi16(M1, M2); \
+	M0 = _mm_subs_epi16(M0, M2); \
+	M1 = _mm_adds_epi16(M1, M2); \
+	M2 = _mm_max_epi16(M3, M4); \
+	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
+	M4 = _mm_max_epi16(M0, M1); \
+	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
+}
+
+/* Two lane deinterleaving K = 5:
+ * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit
+ * registers. The operation summarized below. Four registers are used with
+ * the lower 2 as input and upper 2 as output.
+ *
+ * In   - 10101010 10101010 10101010 10101010
+ * Out  - 00000000 11111111 00000000 11111111
+ *
+ * Input:
+ * M0:1 - Packed 16-bit integers
+ *
+ * Output:
+ * M2:3 - Deinterleaved packed 16-bit integers
+ */
+#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0
+
+#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
+{ \
+	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
+	M0 = _mm_shuffle_epi8(M0, M2); \
+	M1 = _mm_shuffle_epi8(M1, M2); \
+	M2 = _mm_unpacklo_epi64(M0, M1); \
+	M3 = _mm_unpackhi_epi64(M0, M1); \
+}
+
+/* Two lane deinterleaving K = 7:
+ * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit
+ * registers. The operation summarized below. 16 registers are used with the
+ * lower 8 as input and upper 8 as output.
+ *
+ * In   - 10101010 10101010 10101010 10101010 ...
+ * Out  - 00000000 11111111 00000000 11111111 ...
+ *
+ * Input:
+ * M0:7 - Packed 16-bit integers
+ *
+ * Output:
+ * M8:15 - Deinterleaved packed 16-bit integers
+ */
+#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
+	M8, M9, M10, M11, M12, M13, M14, M15) \
+{ \
+	M8  = _mm_set_epi8(_I8_SHUFFLE_MASK); \
+	M0  = _mm_shuffle_epi8(M0, M8); \
+	M1  = _mm_shuffle_epi8(M1, M8); \
+	M2  = _mm_shuffle_epi8(M2, M8); \
+	M3  = _mm_shuffle_epi8(M3, M8); \
+	M4  = _mm_shuffle_epi8(M4, M8); \
+	M5  = _mm_shuffle_epi8(M5, M8); \
+	M6  = _mm_shuffle_epi8(M6, M8); \
+	M7  = _mm_shuffle_epi8(M7, M8); \
+	M8  = _mm_unpacklo_epi64(M0, M1); \
+	M9  = _mm_unpackhi_epi64(M0, M1); \
+	M10 = _mm_unpacklo_epi64(M2, M3); \
+	M11 = _mm_unpackhi_epi64(M2, M3); \
+	M12 = _mm_unpacklo_epi64(M4, M5); \
+	M13 = _mm_unpackhi_epi64(M4, M5); \
+	M14 = _mm_unpacklo_epi64(M6, M7); \
+	M15 = _mm_unpackhi_epi64(M6, M7); \
+}
+
+/* Generate branch metrics N = 2:
+ * Compute 16 branch metrics from trellis outputs and input values.
+ *
+ * Input:
+ * M0:3 - 16 x 2 packed 16-bit trellis outputs
+ * M4   - Expanded and packed 16-bit input value
+ *
+ * Output:
+ * M6:7 - 16 computed 16-bit branch metrics
+ */
+#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
+{ \
+	M0 = _mm_sign_epi16(M4, M0); \
+	M1 = _mm_sign_epi16(M4, M1); \
+	M2 = _mm_sign_epi16(M4, M2); \
+	M3 = _mm_sign_epi16(M4, M3); \
+	M6 = _mm_hadds_epi16(M0, M1); \
+	M7 = _mm_hadds_epi16(M2, M3); \
+}
+
+/* Generate branch metrics N = 4:
+ * Compute 8 branch metrics from trellis outputs and input values. This
+ * macro is reused for N less than 4 where the extra soft input bits are
+ * padded.
+ *
+ * Input:
+ * M0:3 - 8 x 4 packed 16-bit trellis outputs
+ * M4   - Expanded and packed 16-bit input value
+ *
+ * Output:
+ * M5   - 8 computed 16-bit branch metrics
+ */
+#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
+{ \
+	M0 = _mm_sign_epi16(M4, M0); \
+	M1 = _mm_sign_epi16(M4, M1); \
+	M2 = _mm_sign_epi16(M4, M2); \
+	M3 = _mm_sign_epi16(M4, M3); \
+	M0 = _mm_hadds_epi16(M0, M1); \
+	M1 = _mm_hadds_epi16(M2, M3); \
+	M5 = _mm_hadds_epi16(M0, M1); \
+}
+
+/* Broadcast 16-bit integer
+ * Repeat the low 16-bit integer to all elements of the 128-bit SSE
+ * register. Only AVX2 has a dedicated broadcast instruction; use repeat
+ * unpacks for SSE only architectures. This is a destructive operation and
+ * the source register is overwritten.
+ *
+ * Input:
+ * M0 - Low 16-bit element is read
+ *
+ * Output:
+ * M0 - Contains broadcasted values
+ */
+#ifdef HAVE_AVX2
+#define SSE_BROADCAST(M0) \
+{ \
+	if (avx2_supported) { \
+		M0 = _mm_broadcastw_epi16(M0); \
+	} else { \
+		M0 = _mm_unpacklo_epi16(M0, M0); \
+		M0 = _mm_unpacklo_epi32(M0, M0); \
+		M0 = _mm_unpacklo_epi64(M0, M0); \
+	} \
+}
+#else
+#define SSE_BROADCAST(M0) \
+{ \
+	M0 = _mm_unpacklo_epi16(M0, M0); \
+	M0 = _mm_unpacklo_epi32(M0, M0); \
+	M0 = _mm_unpacklo_epi64(M0, M0); \
+}
+#endif
+
+/* Horizontal minimum
+ * Compute horizontal minimum of packed unsigned 16-bit integers and place
+ * result in the low 16-bit element of the source register. Only SSE 4.1
+ * has a dedicated minpos instruction. One intermediate register is used
+ * if SSE 4.1 is not available. This is a destructive operation and the
+ * source register is overwritten.
+ *
+ * Input:
+ * M0 - Packed unsigned 16-bit integers
+ *
+ * Output:
+ * M0 - Minimum value placed in low 16-bit element
+ */
+#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
+#define SSE_MINPOS(M0, M1) \
+{ \
+	if (sse41_supported) { \
+		M0 = _mm_minpos_epu16(M0); \
+	} else { \
+		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
+		M0 = _mm_min_epi16(M0, M1); \
+		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
+		M0 = _mm_min_epi16(M0, M1); \
+		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
+		M0 = _mm_min_epi16(M0, M1); \
+	} \
+}
+#else
+#define SSE_MINPOS(M0, M1) \
+{ \
+	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
+	M0 = _mm_min_epi16(M0, M1); \
+	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
+	M0 = _mm_min_epi16(M0, M1); \
+	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
+	M0 = _mm_min_epi16(M0, M1); \
+}
+#endif
+
+/* Normalize state metrics K = 5:
+ * Compute 16-wide normalization by subtracting the smallest value from
+ * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
+ * Two intermediate registers are used and normalized results are placed
+ * in the originating locations.
+ *
+ * Input:
+ * M0:1 - Path metrics 0:1 (packed 16-bit integers)
+ *
+ * Output:
+ * M0:1 - Normalized path metrics 0:1
+ */
+#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
+{ \
+	M2 = _mm_min_epi16(M0, M1); \
+	SSE_MINPOS(M2, M3) \
+	SSE_BROADCAST(M2) \
+	M0 = _mm_subs_epi16(M0, M2); \
+	M1 = _mm_subs_epi16(M1, M2); \
+}
+
+/* Normalize state metrics K = 7:
+ * Compute 64-wide normalization by subtracting the smallest value from
+ * all values. Inputs are 8 registers of accumulated sums and 4 temporary
+ * registers. Normalized results are returned in the originating locations.
+ *
+ * Input:
+ * M0:7 - Path metrics 0:7 (packed 16-bit integers)
+ *
+ * Output:
+ * M0:7 - Normalized path metrics 0:7
+ */
+#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
+{ \
+	M8  = _mm_min_epi16(M0, M1); \
+	M9  = _mm_min_epi16(M2, M3); \
+	M10 = _mm_min_epi16(M4, M5); \
+	M11 = _mm_min_epi16(M6, M7); \
+	M8  = _mm_min_epi16(M8, M9); \
+	M10 = _mm_min_epi16(M10, M11); \
+	M8  = _mm_min_epi16(M8, M10); \
+	SSE_MINPOS(M8, M9) \
+	SSE_BROADCAST(M8) \
+	M0  = _mm_subs_epi16(M0, M8); \
+	M1  = _mm_subs_epi16(M1, M8); \
+	M2  = _mm_subs_epi16(M2, M8); \
+	M3  = _mm_subs_epi16(M3, M8); \
+	M4  = _mm_subs_epi16(M4, M8); \
+	M5  = _mm_subs_epi16(M5, M8); \
+	M6  = _mm_subs_epi16(M6, M8); \
+	M7  = _mm_subs_epi16(M7, M8); \
+}
+
+/* Combined BMU/PMU (K=5, N=2)
+ * Compute branch metrics followed by path metrics for half rate 16-state
+ * trellis. 8 butterflies are computed. Accumulated path sums are not
+ * preserved and read and written into the same memory location. Normalize
+ * sums if requires.
+ */
+__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	__m128i m0, m1, m2, m3, m4, m5, m6;
+
+	/* (BMU) Load input sequence */
+	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
+
+	/* (BMU) Load trellis outputs */
+	m0 = _mm_load_si128((__m128i *) &out[0]);
+	m1 = _mm_load_si128((__m128i *) &out[8]);
+
+	/* (BMU) Compute branch metrics */
+	m0 = _mm_sign_epi16(m2, m0);
+	m1 = _mm_sign_epi16(m2, m1);
+	m2 = _mm_hadds_epi16(m0, m1);
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = _mm_load_si128((__m128i *) &sums[0]);
+	m1 = _mm_load_si128((__m128i *) &sums[8]);
+
+	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)
+
+	/* (PMU) Butterflies: 0-7 */
+	SSE_BUTTERFLY(m3, m4, m2, m5, m6)
+
+	if (norm)
+		SSE_NORMALIZE_K5(m2, m6, m0, m1)
+
+	_mm_store_si128((__m128i *) &sums[0], m2);
+	_mm_store_si128((__m128i *) &sums[8], m6);
+	_mm_store_si128((__m128i *) &paths[0], m5);
+	_mm_store_si128((__m128i *) &paths[8], m4);
+}
+
+/* Combined BMU/PMU (K=5, N=3 and N=4)
+ * Compute branch metrics followed by path metrics for 16-state and rates
+ * to 1/4. 8 butterflies are computed. The input sequence is read four 16-bit
+ * values at a time, and extra values should be set to zero for rates other
+ * than 1/4. Normally only rates 1/3 and 1/4 are used as there is a
+ * dedicated implementation of rate 1/2.
+ */
+__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	__m128i m0, m1, m2, m3, m4, m5, m6;
+
+	/* (BMU) Load input sequence */
+	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
+
+	/* (BMU) Load trellis outputs */
+	m0 = _mm_load_si128((__m128i *) &out[0]);
+	m1 = _mm_load_si128((__m128i *) &out[8]);
+	m2 = _mm_load_si128((__m128i *) &out[16]);
+	m3 = _mm_load_si128((__m128i *) &out[24]);
+
+	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = _mm_load_si128((__m128i *) &sums[0]);
+	m1 = _mm_load_si128((__m128i *) &sums[8]);
+
+	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)
+
+	/* (PMU) Butterflies: 0-7 */
+	SSE_BUTTERFLY(m3, m4, m2, m5, m6)
+
+	if (norm)
+		SSE_NORMALIZE_K5(m2, m6, m0, m1)
+
+	_mm_store_si128((__m128i *) &sums[0], m2);
+	_mm_store_si128((__m128i *) &sums[8], m6);
+	_mm_store_si128((__m128i *) &paths[0], m5);
+	_mm_store_si128((__m128i *) &paths[8], m4);
+}
+
+/* Combined BMU/PMU (K=7, N=2)
+ * Compute branch metrics followed by path metrics for half rate 64-state
+ * trellis. 32 butterfly operations are computed. Deinterleaving path
+ * metrics requires usage of the full SSE register file, so separate sums
+ * before computing branch metrics to avoid register spilling.
+ */
+__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
+		m9, m10, m11, m12, m13, m14, m15;
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = _mm_load_si128((__m128i *) &sums[0]);
+	m1 = _mm_load_si128((__m128i *) &sums[8]);
+	m2 = _mm_load_si128((__m128i *) &sums[16]);
+	m3 = _mm_load_si128((__m128i *) &sums[24]);
+	m4 = _mm_load_si128((__m128i *) &sums[32]);
+	m5 = _mm_load_si128((__m128i *) &sums[40]);
+	m6 = _mm_load_si128((__m128i *) &sums[48]);
+	m7 = _mm_load_si128((__m128i *) &sums[56]);
+
+	/* (PMU) Deinterleave to even-odd registers */
+	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3 ,m4 ,m5, m6, m7,
+			    m8, m9, m10, m11, m12, m13, m14, m15)
+
+	/* (BMU) Load input symbols */
+	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
+
+	/* (BMU) Load trellis outputs */
+	m0 = _mm_load_si128((__m128i *) &out[0]);
+	m1 = _mm_load_si128((__m128i *) &out[8]);
+	m2 = _mm_load_si128((__m128i *) &out[16]);
+	m3 = _mm_load_si128((__m128i *) &out[24]);
+
+	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)
+
+	m0 = _mm_load_si128((__m128i *) &out[32]);
+	m1 = _mm_load_si128((__m128i *) &out[40]);
+	m2 = _mm_load_si128((__m128i *) &out[48]);
+	m3 = _mm_load_si128((__m128i *) &out[56]);
+
+	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)
+
+	/* (PMU) Butterflies: 0-15 */
+	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
+	SSE_BUTTERFLY(m10, m11, m5, m2, m3)
+
+	_mm_store_si128((__m128i *) &paths[0], m0);
+	_mm_store_si128((__m128i *) &paths[8], m2);
+	_mm_store_si128((__m128i *) &paths[32], m9);
+	_mm_store_si128((__m128i *) &paths[40], m11);
+
+	/* (PMU) Butterflies: 17-31 */
+	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
+	SSE_BUTTERFLY(m14, m15, m7, m9, m11)
+
+	_mm_store_si128((__m128i *) &paths[16], m0);
+	_mm_store_si128((__m128i *) &paths[24], m9);
+	_mm_store_si128((__m128i *) &paths[48], m13);
+	_mm_store_si128((__m128i *) &paths[56], m15);
+
+	if (norm)
+		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
+				 m7, m11, m0, m8, m9, m10)
+
+	_mm_store_si128((__m128i *) &sums[0], m4);
+	_mm_store_si128((__m128i *) &sums[8], m5);
+	_mm_store_si128((__m128i *) &sums[16], m6);
+	_mm_store_si128((__m128i *) &sums[24], m7);
+	_mm_store_si128((__m128i *) &sums[32], m1);
+	_mm_store_si128((__m128i *) &sums[40], m3);
+	_mm_store_si128((__m128i *) &sums[48], m2);
+	_mm_store_si128((__m128i *) &sums[56], m11);
+}
+
+/* Combined BMU/PMU (K=7, N=3 and N=4)
+ * Compute branch metrics followed by path metrics for half rate 64-state
+ * trellis. 32 butterfly operations are computed. Deinterleave path
+ * metrics before computing branch metrics as in the half rate case.
+ */
+__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
+	__m128i m8, m9, m10, m11, m12, m13, m14, m15;
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = _mm_load_si128((__m128i *) &sums[0]);
+	m1 = _mm_load_si128((__m128i *) &sums[8]);
+	m2 = _mm_load_si128((__m128i *) &sums[16]);
+	m3 = _mm_load_si128((__m128i *) &sums[24]);
+	m4 = _mm_load_si128((__m128i *) &sums[32]);
+	m5 = _mm_load_si128((__m128i *) &sums[40]);
+	m6 = _mm_load_si128((__m128i *) &sums[48]);
+	m7 = _mm_load_si128((__m128i *) &sums[56]);
+
+	/* (PMU) Deinterleave into even and odd packed registers */
+	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3 ,m4 ,m5, m6, m7,
+			    m8, m9, m10, m11, m12, m13, m14, m15)
+
+	/* (BMU) Load and expand 8-bit input out to 16-bits */
+	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
+
+	/* (BMU) Load and compute branch metrics */
+	m0 = _mm_load_si128((__m128i *) &out[0]);
+	m1 = _mm_load_si128((__m128i *) &out[8]);
+	m2 = _mm_load_si128((__m128i *) &out[16]);
+	m3 = _mm_load_si128((__m128i *) &out[24]);
+
+	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)
+
+	m0 = _mm_load_si128((__m128i *) &out[32]);
+	m1 = _mm_load_si128((__m128i *) &out[40]);
+	m2 = _mm_load_si128((__m128i *) &out[48]);
+	m3 = _mm_load_si128((__m128i *) &out[56]);
+
+	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)
+
+	m0 = _mm_load_si128((__m128i *) &out[64]);
+	m1 = _mm_load_si128((__m128i *) &out[72]);
+	m2 = _mm_load_si128((__m128i *) &out[80]);
+	m3 = _mm_load_si128((__m128i *) &out[88]);
+
+	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)
+
+	m0 = _mm_load_si128((__m128i *) &out[96]);
+	m1 = _mm_load_si128((__m128i *) &out[104]);
+	m2 = _mm_load_si128((__m128i *) &out[112]);
+	m3 = _mm_load_si128((__m128i *) &out[120]);
+
+	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)
+
+	/* (PMU) Butterflies: 0-15 */
+	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
+	SSE_BUTTERFLY(m10, m11, m5, m2, m3)
+
+	_mm_store_si128((__m128i *) &paths[0], m0);
+	_mm_store_si128((__m128i *) &paths[8], m2);
+	_mm_store_si128((__m128i *) &paths[32], m9);
+	_mm_store_si128((__m128i *) &paths[40], m11);
+
+	/* (PMU) Butterflies: 17-31 */
+	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
+	SSE_BUTTERFLY(m14, m15, m7, m9, m11)
+
+	_mm_store_si128((__m128i *) &paths[16], m0);
+	_mm_store_si128((__m128i *) &paths[24], m9);
+	_mm_store_si128((__m128i *) &paths[48], m13);
+	_mm_store_si128((__m128i *) &paths[56], m15);
+
+	if (norm)
+		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
+				 m7, m11, m0, m8, m9, m10)
+
+	_mm_store_si128((__m128i *) &sums[0], m4);
+	_mm_store_si128((__m128i *) &sums[8], m5);
+	_mm_store_si128((__m128i *) &sums[16], m6);
+	_mm_store_si128((__m128i *) &sums[24], m7);
+	_mm_store_si128((__m128i *) &sums[32], m1);
+	_mm_store_si128((__m128i *) &sums[40], m3);
+	_mm_store_si128((__m128i *) &sums[48], m2);
+	_mm_store_si128((__m128i *) &sums[56], m11);
+}
+
+/* Aligned Memory Allocator
+ * SSE requires 16-byte memory alignment. We store relevant trellis values
+ * (accumulated sums, outputs, and path decisions) as 16 bit signed integers
+ * so the allocated memory is casted as such.
+ */
+__attribute__ ((visibility("hidden")))
+int16_t *osmo_conv_vdec_malloc_sse3(size_t n)
+{
+	return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_vdec_free_sse3(int16_t *ptr)
+{
+	_mm_free(ptr);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k5_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k7_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
+}
author	Tom Tsou <tom.tsou@ettus.com>	2017-04-29 00:16:43 +0700
committer	Harald Welte <laforge@gnumonks.org>	2017-05-24 22:04:53 +0000
commit	34e228a9bcf3ac37287bb5e684ace46818740f3b (patch)
tree	ebb7bcd4dd9c494106096384327db0122a4fde01
parent	b6c8dda5e34df6b74183ad24cf66c98601065e56 (diff)