From e4fe71c04fbe69581ba10f7b19158a4559b5a5d8 Mon Sep 17 00:00:00 2001 From: Vadim Yanitskiy Date: Mon, 19 Jun 2017 17:59:48 +0700 Subject: core/conv: use proper filenames We already have generic convolutional transcoding implementation written by Sylvain Munaut and named 'conv.c', so 'viterbi_*' names looked a bit confusing. Let's use a single naming scheme for Viterbi related code. Change-Id: I61062a8d1fbf5f5fc85b4fac58dc4e9fa8b5ef90 --- src/Makefile.am | 16 +- src/conv_acc.c | 728 +++++++++++++++++++++++++++++++++++++++++++++++ src/conv_acc_generic.c | 207 ++++++++++++++ src/conv_acc_sse.c | 129 +++++++++ src/conv_acc_sse_avx.c | 129 +++++++++ src/conv_acc_sse_impl.h | 495 ++++++++++++++++++++++++++++++++ src/viterbi.c | 728 ----------------------------------------------- src/viterbi_generic.c | 207 -------------- src/viterbi_sse.c | 129 --------- src/viterbi_sse_avx.c | 129 --------- src/viterbi_sse_common.h | 495 -------------------------------- 11 files changed, 1696 insertions(+), 1696 deletions(-) create mode 100644 src/conv_acc.c create mode 100644 src/conv_acc_generic.c create mode 100644 src/conv_acc_sse.c create mode 100644 src/conv_acc_sse_avx.c create mode 100644 src/conv_acc_sse_impl.h delete mode 100644 src/viterbi.c delete mode 100644 src/viterbi_generic.c delete mode 100644 src/viterbi_sse.c delete mode 100644 src/viterbi_sse_avx.c delete mode 100644 src/viterbi_sse_common.h (limited to 'src') diff --git a/src/Makefile.am b/src/Makefile.am index e98c623c..692b699b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -21,28 +21,28 @@ libosmocore_la_SOURCES = timer.c timer_gettimeofday.c select.c signal.c msgb.c b conv.c application.c rbtree.c strrb.c \ loggingrb.c crc8gen.c crc16gen.c crc32gen.c crc64gen.c \ macaddr.c stat_item.c stats.c stats_statsd.c prim.c \ - viterbi.c viterbi_generic.c sercomm.c + conv_acc.c conv_acc_generic.c sercomm.c if HAVE_SSE3 -libosmocore_la_SOURCES += viterbi_sse.c +libosmocore_la_SOURCES += conv_acc_sse.c if HAVE_SSE4_1 -viterbi_sse.lo : CFLAGS += -msse3 -msse4.1 +conv_acc_sse.lo : CFLAGS += -msse3 -msse4.1 else -viterbi_sse.lo : CFLAGS += -msse3 +conv_acc_sse.lo : CFLAGS += -msse3 endif if HAVE_AVX2 -libosmocore_la_SOURCES += viterbi_sse_avx.c +libosmocore_la_SOURCES += conv_acc_sse_avx.c if HAVE_SSE4_1 -viterbi_sse_avx.lo : CFLAGS += -msse3 -mavx2 -msse4.1 +conv_acc_sse_avx.lo : CFLAGS += -msse3 -mavx2 -msse4.1 else -viterbi_sse_avx.lo : CFLAGS += -msse3 -mavx2 +conv_acc_sse_avx.lo : CFLAGS += -msse3 -mavx2 endif endif endif BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c -EXTRA_DIST = viterbi_sse_common.h +EXTRA_DIST = conv_acc_sse_impl.h if ENABLE_PLUGIN libosmocore_la_SOURCES += plugin.c diff --git a/src/conv_acc.c b/src/conv_acc.c new file mode 100644 index 00000000..308cfe07 --- /dev/null +++ b/src/conv_acc.c @@ -0,0 +1,728 @@ +/* + * Viterbi decoder + * + * Copyright (C) 2013, 2014 Thomas Tsou + * + * All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "config.h"
+
+#include <osmocom/core/conv.h>
+
+#define BIT2NRZ(REG,N)	(((REG >> N) & 0x01) * 2 - 1) * -1
+#define NUM_STATES(K)	(K == 7 ? 64 : 16)
+
+#define INIT_POINTERS(simd) \
+{ \
+	osmo_conv_metrics_k5_n2 = osmo_conv_##simd##_metrics_k5_n2; \
+	osmo_conv_metrics_k5_n3 = osmo_conv_##simd##_metrics_k5_n3; \
+	osmo_conv_metrics_k5_n4 = osmo_conv_##simd##_metrics_k5_n4; \
+	osmo_conv_metrics_k7_n2 = osmo_conv_##simd##_metrics_k7_n2; \
+	osmo_conv_metrics_k7_n3 = osmo_conv_##simd##_metrics_k7_n3; \
+	osmo_conv_metrics_k7_n4 = osmo_conv_##simd##_metrics_k7_n4; \
+	vdec_malloc = &osmo_conv_##simd##_vdec_malloc; \
+	vdec_free = &osmo_conv_##simd##_vdec_free; \
+}
+
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * These pointers are initialized at runtime by osmo_conv_init(),
+ * depending on the supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
+static void (*vdec_free)(int16_t *ptr);
+
+void (*osmo_conv_metrics_k5_n2)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k5_n3)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k5_n4)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k7_n2)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k7_n3)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k7_n4)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+
+/* Forward malloc wrappers */
+int16_t *osmo_conv_gen_vdec_malloc(size_t n);
+void osmo_conv_gen_vdec_free(int16_t *ptr);
+
+#if defined(HAVE_SSE3)
+int16_t *osmo_conv_sse_vdec_malloc(size_t n);
+void osmo_conv_sse_vdec_free(int16_t *ptr);
+#endif
+
+#if defined(HAVE_SSE3) && defined(HAVE_AVX2)
+int16_t *osmo_conv_sse_avx_vdec_malloc(size_t n);
+void osmo_conv_sse_avx_vdec_free(int16_t *ptr);
+#endif
+
+/* Forward Metric Units */
+void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+
+#if defined(HAVE_SSE3)
+void osmo_conv_sse_metrics_k5_n2(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_sse_metrics_k5_n3(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_sse_metrics_k5_n4(const int8_t *seq, const int16_t *out,
+	int16_t *sums,
int16_t *paths, int norm); +void osmo_conv_sse_metrics_k7_n2(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_metrics_k7_n3(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_metrics_k7_n4(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +#endif + +#if defined(HAVE_SSE3) && defined(HAVE_AVX2) +void osmo_conv_sse_avx_metrics_k5_n2(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_avx_metrics_k5_n3(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_avx_metrics_k5_n4(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_avx_metrics_k7_n2(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_avx_metrics_k7_n3(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_avx_metrics_k7_n4(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +#endif + +/* Trellis State + * state - Internal lshift register value + * prev - Register values of previous 0 and 1 states + */ +struct vstate { + unsigned state; + unsigned prev[2]; +}; + +/* Trellis Object + * num_states - Number of states in the trellis + * sums - Accumulated path metrics + * outputs - Trellis output values + * vals - Input value that led to each state + */ +struct vtrellis { + int num_states; + int16_t *sums; + int16_t *outputs; + uint8_t *vals; +}; + +/* Viterbi Decoder + * n - Code order + * k - Constraint length + * len - Horizontal length of trellis + * recursive - Set to '1' if the code is recursive + * intrvl - Normalization interval + * trellis - Trellis object + * punc - Puncturing sequence + * paths - Trellis paths + */ +struct vdecoder { + int n; + int k; + int len; + int recursive; + int intrvl; + struct vtrellis *trellis; + int *punc; + int16_t **paths; + + void (*metric_func)(const int8_t *, const int16_t *, + int16_t *, int16_t *, int); +}; + +/* Accessor calls */ +static inline int conv_code_recursive(const struct osmo_conv_code *code) +{ + return code->next_term_output ? 
1 : 0; +} + +/* Left shift and mask for finding the previous state */ +static unsigned vstate_lshift(unsigned reg, int k, int val) +{ + unsigned mask; + + if (k == 5) + mask = 0x0e; + else if (k == 7) + mask = 0x3e; + else + mask = 0; + + return ((reg << 1) & mask) | val; +} + +/* Bit endian manipulators */ +static inline unsigned bitswap2(unsigned v) +{ + return ((v & 0x02) >> 1) | ((v & 0x01) << 1); +} + +static inline unsigned bitswap3(unsigned v) +{ + return ((v & 0x04) >> 2) | ((v & 0x02) >> 0) | + ((v & 0x01) << 2); +} + +static inline unsigned bitswap4(unsigned v) +{ + return ((v & 0x08) >> 3) | ((v & 0x04) >> 1) | + ((v & 0x02) << 1) | ((v & 0x01) << 3); +} + +static inline unsigned bitswap5(unsigned v) +{ + return ((v & 0x10) >> 4) | ((v & 0x08) >> 2) | ((v & 0x04) >> 0) | + ((v & 0x02) << 2) | ((v & 0x01) << 4); +} + +static inline unsigned bitswap6(unsigned v) +{ + return ((v & 0x20) >> 5) | ((v & 0x10) >> 3) | ((v & 0x08) >> 1) | + ((v & 0x04) << 1) | ((v & 0x02) << 3) | ((v & 0x01) << 5); +} + +static unsigned bitswap(unsigned v, unsigned n) +{ + switch (n) { + case 1: + return v; + case 2: + return bitswap2(v); + case 3: + return bitswap3(v); + case 4: + return bitswap4(v); + case 5: + return bitswap5(v); + case 6: + return bitswap6(v); + default: + return 0; + } +} + +/* Generate non-recursive state output from generator state table + * Note that the shift register moves right (i.e. the most recent bit is + * shifted into the register at k-1 bit of the register), which is typical + * textbook representation. The API transition table expects the most recent + * bit in the low order bit, or left shift. A bitswap operation is required + * to accommodate the difference. + */ +static unsigned gen_output(struct vstate *state, int val, + const struct osmo_conv_code *code) +{ + unsigned out, prev; + + prev = bitswap(state->prev[0], code->K - 1); + out = code->next_output[prev][val]; + out = bitswap(out, code->N); + + return out; +} + +/* Populate non-recursive trellis state + * For a given state defined by the k-1 length shift register, find the + * value of the input bit that drove the trellis to that state. Also + * generate the N outputs of the generator polynomial at that state. + */ +static int gen_state_info(uint8_t *val, unsigned reg, + int16_t *output, const struct osmo_conv_code *code) +{ + int i; + unsigned out; + struct vstate state; + + /* Previous '0' state */ + state.state = reg; + state.prev[0] = vstate_lshift(reg, code->K, 0); + state.prev[1] = vstate_lshift(reg, code->K, 1); + + *val = (reg >> (code->K - 2)) & 0x01; + + /* Transition output */ + out = gen_output(&state, *val, code); + + /* Unpack to NRZ */ + for (i = 0; i < code->N; i++) + output[i] = BIT2NRZ(out, i); + + return 0; +} + +/* Generate recursive state output from generator state table */ +static unsigned gen_recursive_output(struct vstate *state, + uint8_t *val, unsigned reg, + const struct osmo_conv_code *code, int pos) +{ + int val0, val1; + unsigned out, prev; + + /* Previous '0' state */ + prev = vstate_lshift(reg, code->K, 0); + prev = bitswap(prev, code->K - 1); + + /* Input value */ + val0 = (reg >> (code->K - 2)) & 0x01; + val1 = (code->next_term_output[prev] >> pos) & 0x01; + *val = val0 == val1 ? 
0 : 1; + + /* Wrapper for osmocom state access */ + prev = bitswap(state->prev[0], code->K - 1); + + /* Compute the transition output */ + out = code->next_output[prev][*val]; + out = bitswap(out, code->N); + + return out; +} + +/* Populate recursive trellis state + * The bit position of the systematic bit is not explicitly marked by the + * API, so it must be extracted from the generator table. Otherwise, + * populate the trellis similar to the non-recursive version. + * Non-systematic recursive codes are not supported. + */ +static int gen_recursive_state_info(uint8_t *val, + unsigned reg, int16_t *output, const struct osmo_conv_code *code) +{ + int i, j, pos = -1; + int ns = NUM_STATES(code->K); + unsigned out; + struct vstate state; + + /* Previous '0' and '1' states */ + state.state = reg; + state.prev[0] = vstate_lshift(reg, code->K, 0); + state.prev[1] = vstate_lshift(reg, code->K, 1); + + /* Find recursive bit location */ + for (i = 0; i < code->N; i++) { + for (j = 0; j < ns; j++) { + if ((code->next_output[j][0] >> i) & 0x01) + break; + } + + if (j == ns) { + pos = i; + break; + } + } + + /* Non-systematic recursive code not supported */ + if (pos < 0) + return -EPROTO; + + /* Transition output */ + out = gen_recursive_output(&state, val, reg, code, pos); + + /* Unpack to NRZ */ + for (i = 0; i < code->N; i++) + output[i] = BIT2NRZ(out, i); + + return 0; +} + +/* Release the trellis */ +static void free_trellis(struct vtrellis *trellis) +{ + if (!trellis) + return; + + vdec_free(trellis->outputs); + vdec_free(trellis->sums); + free(trellis->vals); + free(trellis); +} + +/* Allocate and initialize the trellis object + * Initialization consists of generating the outputs and output value of a + * given state. Due to trellis symmetry and anti-symmetry, only one of the + * transition paths is utilized by the butterfly operation in the forward + * recursion, so only one set of N outputs is required per state variable. + */ +static struct vtrellis *generate_trellis(const struct osmo_conv_code *code) +{ + int i, rc = -1; + struct vtrellis *trellis; + int16_t *outputs; + + int ns = NUM_STATES(code->K); + int recursive = conv_code_recursive(code); + int olen = (code->N == 2) ? 2 : 4; + + trellis = (struct vtrellis *) calloc(1, sizeof(struct vtrellis)); + if (!trellis) + goto fail; + + trellis->num_states = ns; + trellis->sums = vdec_malloc(ns); + trellis->outputs = vdec_malloc(ns * olen); + trellis->vals = (uint8_t *) malloc(ns * sizeof(uint8_t)); + + if (!trellis->sums || !trellis->outputs || !trellis->vals) + goto fail; + + /* Populate the trellis state objects */ + for (i = 0; i < ns; i++) { + outputs = &trellis->outputs[olen * i]; + if (recursive) { + rc = gen_recursive_state_info(&trellis->vals[i], + i, outputs, code); + } else { + rc = gen_state_info(&trellis->vals[i], + i, outputs, code); + } + } + + if (rc < 0) + goto fail; + + return trellis; + +fail: + free_trellis(trellis); + return NULL; +} + +/* Reset decoder + * Set accumulated path metrics to zero. For termination other than + * tail-biting, initialize the zero state as the encoder starting state. + * Initialize with the maximum accumulated sum at length equal to the + * constraint length. 
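+ *
+ * For example, with K=7 and N=2 the seed below is INT8_MAX * n * k =
+ * 127 * 2 * 7 = 1778, the largest metric a competing path can
+ * accumulate within one constraint length, so the known start state
+ * cannot be overtaken before the first real symbols are processed.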
+ */ +static void reset_decoder(struct vdecoder *dec, int term) +{ + int ns = dec->trellis->num_states; + + memset(dec->trellis->sums, 0, sizeof(int16_t) * ns); + + if (term != CONV_TERM_TAIL_BITING) + dec->trellis->sums[0] = INT8_MAX * dec->n * dec->k; +} + +static void _traceback(struct vdecoder *dec, + unsigned state, uint8_t *out, int len) +{ + int i; + unsigned path; + + for (i = len - 1; i >= 0; i--) { + path = dec->paths[i][state] + 1; + out[i] = dec->trellis->vals[state]; + state = vstate_lshift(state, dec->k, path); + } +} + +static void _traceback_rec(struct vdecoder *dec, + unsigned state, uint8_t *out, int len) +{ + int i; + unsigned path; + + for (i = len - 1; i >= 0; i--) { + path = dec->paths[i][state] + 1; + out[i] = path ^ dec->trellis->vals[state]; + state = vstate_lshift(state, dec->k, path); + } +} + +/* Traceback and generate decoded output + * Find the largest accumulated path metric at the final state except for + * the zero terminated case, where we assume the final state is always zero. + */ +static int traceback(struct vdecoder *dec, uint8_t *out, int term, int len) +{ + int i, sum, max = -1; + unsigned path, state = 0; + + if (term != CONV_TERM_FLUSH) { + for (i = 0; i < dec->trellis->num_states; i++) { + sum = dec->trellis->sums[i]; + if (sum > max) { + max = sum; + state = i; + } + } + + if (max < 0) + return -EPROTO; + } + + for (i = dec->len - 1; i >= len; i--) { + path = dec->paths[i][state] + 1; + state = vstate_lshift(state, dec->k, path); + } + + if (dec->recursive) + _traceback_rec(dec, state, out, len); + else + _traceback(dec, state, out, len); + + return 0; +} + +/* Release decoder object */ +static void free_vdec(struct vdecoder *dec) +{ + if (!dec) + return; + + free_trellis(dec->trellis); + + if (dec->paths != NULL) { + vdec_free(dec->paths[0]); + free(dec->paths); + } + + free(dec); +} + +/* Allocate decoder object + * Subtract the constraint length K on the normalization interval to + * accommodate the initialization path metric at state zero. 
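+ *
+ * For example, with N=2 and K=7: INT16_MAX / (2 * INT8_MAX) = 129, so
+ * the decoder renormalizes every 129 - 7 = 122 trellis steps, which
+ * keeps the 16-bit accumulated sums clear of saturation.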
+ */
+static struct vdecoder *alloc_vdec(const struct osmo_conv_code *code)
+{
+	int i, ns;
+	struct vdecoder *dec;
+
+	ns = NUM_STATES(code->K);
+
+	dec = (struct vdecoder *) calloc(1, sizeof(struct vdecoder));
+	if (!dec)
+		return NULL;
+
+	dec->n = code->N;
+	dec->k = code->K;
+	dec->recursive = conv_code_recursive(code);
+	dec->intrvl = INT16_MAX / (dec->n * INT8_MAX) - dec->k;
+
+	if (dec->k == 5) {
+		switch (dec->n) {
+		case 2:
+			dec->metric_func = osmo_conv_metrics_k5_n2;
+			break;
+		case 3:
+			dec->metric_func = osmo_conv_metrics_k5_n3;
+			break;
+		case 4:
+			dec->metric_func = osmo_conv_metrics_k5_n4;
+			break;
+		default:
+			goto fail;
+		}
+	} else if (dec->k == 7) {
+		switch (dec->n) {
+		case 2:
+			dec->metric_func = osmo_conv_metrics_k7_n2;
+			break;
+		case 3:
+			dec->metric_func = osmo_conv_metrics_k7_n3;
+			break;
+		case 4:
+			dec->metric_func = osmo_conv_metrics_k7_n4;
+			break;
+		default:
+			goto fail;
+		}
+	} else {
+		goto fail;
+	}
+
+	if (code->term == CONV_TERM_FLUSH)
+		dec->len = code->len + code->K - 1;
+	else
+		dec->len = code->len;
+
+	dec->trellis = generate_trellis(code);
+	if (!dec->trellis)
+		goto fail;
+
+	dec->paths = (int16_t **) malloc(sizeof(int16_t *) * dec->len);
+	if (!dec->paths)
+		goto fail;
+
+	dec->paths[0] = vdec_malloc(ns * dec->len);
+	if (!dec->paths[0])
+		goto fail;
+
+	for (i = 1; i < dec->len; i++)
+		dec->paths[i] = &dec->paths[0][i * ns];
+
+	return dec;
+
+fail:
+	free_vdec(dec);
+	return NULL;
+}
+
+/* Depuncture sequence with negative-value terminated puncturing matrix */
+static int depuncture(const int8_t *in, const int *punc, int8_t *out, int len)
+{
+	int i, n = 0, m = 0;
+
+	for (i = 0; i < len; i++) {
+		if (i == punc[n]) {
+			out[i] = 0;
+			n++;
+			continue;
+		}
+
+		out[i] = in[m++];
+	}
+
+	return 0;
+}
+
+/* Forward trellis recursion
+ * Generate branch metrics and path metrics with a combined function. Only
+ * accumulated path metric sums and path selections are stored. Normalize on
+ * the interval specified by the decoder.
+ */
+static void forward_traverse(struct vdecoder *dec, const int8_t *seq)
+{
+	struct vtrellis *trellis = dec->trellis;
+	int i;
+
+	for (i = 0; i < dec->len; i++) {
+		dec->metric_func(&seq[dec->n * i],
+			trellis->outputs,
+			trellis->sums,
+			dec->paths[i],
+			!(i % dec->intrvl));
+	}
+}
+
+/* Convolutional decode with a decoder object
+ * Initial puncturing run if necessary, followed by the forward recursion.
+ * For tail-biting, perform a second pass before running the backward
+ * traceback operation.
+ */
+static int conv_decode(struct vdecoder *dec, const int8_t *seq,
+	const int *punc, uint8_t *out, int len, int term)
+{
+	int8_t depunc[dec->len * dec->n];
+
+	reset_decoder(dec, term);
+
+	if (punc) {
+		depuncture(seq, punc, depunc, dec->len * dec->n);
+		seq = depunc;
+	}
+
+	/* Propagate through the trellis with interval normalization */
+	forward_traverse(dec, seq);
+
+	if (term == CONV_TERM_TAIL_BITING)
+		forward_traverse(dec, seq);
+
+	return traceback(dec, out, term, len);
+}
+
+static void osmo_conv_init(void)
+{
+	init_complete = 1;
+
+#ifdef HAVE___BUILTIN_CPU_SUPPORTS
+	/* Detect CPU capabilities */
+	#ifdef HAVE_AVX2
+	avx2_supported = __builtin_cpu_supports("avx2");
+	#endif
+
+	#ifdef HAVE_SSE3
+	sse3_supported = __builtin_cpu_supports("sse3");
+	#endif
+
+	#ifdef HAVE_SSE4_1
+	sse41_supported = __builtin_cpu_supports("sse4.1");
+	#endif
+#endif
+
+/**
+ * Usage of curly braces is mandatory here,
+ * because we use a multi-line define.
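+ *
+ * Without braces the expansion breaks: INIT_POINTERS() expands to a
+ * '{ ... }' block, so in
+ *
+ *	if (sse3_supported)
+ *		INIT_POINTERS(sse);
+ *	else
+ *
+ * the ';' left after the expanded block would orphan the 'else'.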
+ */
+#if defined(HAVE_SSE3) && defined(HAVE_AVX2)
+	if (sse3_supported && avx2_supported) {
+		INIT_POINTERS(sse_avx);
+	} else if (sse3_supported) {
+		INIT_POINTERS(sse);
+	} else {
+		INIT_POINTERS(gen);
+	}
+#elif defined(HAVE_SSE3)
+	if (sse3_supported) {
+		INIT_POINTERS(sse);
+	} else {
+		INIT_POINTERS(gen);
+	}
+#else
+	INIT_POINTERS(gen);
+#endif
+}
+
+/* All-in-one Viterbi decoding */
+int osmo_conv_decode_acc(const struct osmo_conv_code *code,
+	const sbit_t *input, ubit_t *output)
+{
+	int rc;
+	struct vdecoder *vdec;
+
+	if (!init_complete)
+		osmo_conv_init();
+
+	if ((code->N < 2) || (code->N > 4) || (code->len < 1) ||
+		((code->K != 5) && (code->K != 7)))
+		return -EINVAL;
+
+	vdec = alloc_vdec(code);
+	if (!vdec)
+		return -EFAULT;
+
+	rc = conv_decode(vdec, input, code->puncture,
+		output, code->len, code->term);
+
+	free_vdec(vdec);
+
+	return rc;
+}
diff --git a/src/conv_acc_generic.c b/src/conv_acc_generic.c
new file mode 100644
index 00000000..60426685
--- /dev/null
+++ b/src/conv_acc_generic.c
@@ -0,0 +1,207 @@
+/*
+ * Viterbi decoder
+ *
+ * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
+ *
+ * All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Add-Compare-Select (ACS-Butterfly)
+ * Compute 4 accumulated path metrics and 4 path selections. Note that path
+ * selections are stored as -1 and 0 rather than 0 and 1. This is to match
+ * the output format of the SSE packed compare instruction 'pmaxuw'.
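+ *
+ * This convention also keeps the traceback cheap: the traceback routines
+ * in conv_acc.c recover a conventional 0/1 selection as 'path + 1', so a
+ * stored -1 selects the previous '0' state and a stored 0 the '1' state.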
+ */ + +static void acs_butterfly(int state, int num_states, + int16_t metric, int16_t *sum, + int16_t *new_sum, int16_t *path) +{ + int state0, state1; + int sum0, sum1, sum2, sum3; + + state0 = *(sum + (2 * state + 0)); + state1 = *(sum + (2 * state + 1)); + + sum0 = state0 + metric; + sum1 = state1 - metric; + sum2 = state0 - metric; + sum3 = state1 + metric; + + if (sum0 >= sum1) { + *new_sum = sum0; + *path = -1; + } else { + *new_sum = sum1; + *path = 0; + } + + if (sum2 >= sum3) { + *(new_sum + num_states / 2) = sum2; + *(path + num_states / 2) = -1; + } else { + *(new_sum + num_states / 2) = sum3; + *(path + num_states / 2) = 0; + } +} + +/* Branch metrics unit N=2 */ +static void gen_branch_metrics_n2(int num_states, const int8_t *seq, + const int16_t *out, int16_t *metrics) +{ + int i; + + for (i = 0; i < num_states / 2; i++) { + metrics[i] = seq[0] * out[2 * i + 0] + + seq[1] * out[2 * i + 1]; + } +} + +/* Branch metrics unit N=3 */ +static void gen_branch_metrics_n3(int num_states, const int8_t *seq, + const int16_t *out, int16_t *metrics) +{ + int i; + + for (i = 0; i < num_states / 2; i++) { + metrics[i] = seq[0] * out[4 * i + 0] + + seq[1] * out[4 * i + 1] + + seq[2] * out[4 * i + 2]; + } +} + +/* Branch metrics unit N=4 */ +static void gen_branch_metrics_n4(int num_states, const int8_t *seq, + const int16_t *out, int16_t *metrics) +{ + int i; + + for (i = 0; i < num_states / 2; i++) { + metrics[i] = seq[0] * out[4 * i + 0] + + seq[1] * out[4 * i + 1] + + seq[2] * out[4 * i + 2] + + seq[3] * out[4 * i + 3]; + } +} + +/* Path metric unit */ +static void gen_path_metrics(int num_states, int16_t *sums, + int16_t *metrics, int16_t *paths, int norm) +{ + int i; + int16_t min; + int16_t new_sums[num_states]; + + for (i = 0; i < num_states / 2; i++) + acs_butterfly(i, num_states, metrics[i], + sums, &new_sums[i], &paths[i]); + + if (norm) { + min = new_sums[0]; + + for (i = 1; i < num_states; i++) + if (new_sums[i] < min) + min = new_sums[i]; + + for (i = 0; i < num_states; i++) + new_sums[i] -= min; + } + + memcpy(sums, new_sums, num_states * sizeof(int16_t)); +} + +/* Not-aligned Memory Allocator */ +__attribute__ ((visibility("hidden"))) +int16_t *osmo_conv_gen_vdec_malloc(size_t n) +{ + return (int16_t *) malloc(sizeof(int16_t) * n); +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_gen_vdec_free(int16_t *ptr) +{ + free(ptr); +} + +/* 16-state branch-path metrics units (K=5) */ +__attribute__ ((visibility("hidden"))) +void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + int16_t metrics[8]; + + gen_branch_metrics_n2(16, seq, out, metrics); + gen_path_metrics(16, sums, metrics, paths, norm); +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_gen_metrics_k5_n3(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + int16_t metrics[8]; + + gen_branch_metrics_n3(16, seq, out, metrics); + gen_path_metrics(16, sums, metrics, paths, norm); + +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_gen_metrics_k5_n4(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + int16_t metrics[8]; + + gen_branch_metrics_n4(16, seq, out, metrics); + gen_path_metrics(16, sums, metrics, paths, norm); + +} + +/* 64-state branch-path metrics units (K=7) */ +__attribute__ ((visibility("hidden"))) +void osmo_conv_gen_metrics_k7_n2(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + int16_t metrics[32]; + + 
	gen_branch_metrics_n2(64, seq, out, metrics);
+	gen_path_metrics(64, sums, metrics, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_gen_metrics_k7_n3(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	int16_t metrics[32];
+
+	gen_branch_metrics_n3(64, seq, out, metrics);
+	gen_path_metrics(64, sums, metrics, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	int16_t metrics[32];
+
+	gen_branch_metrics_n4(64, seq, out, metrics);
+	gen_path_metrics(64, sums, metrics, paths, norm);
+}
diff --git a/src/conv_acc_sse.c b/src/conv_acc_sse.c
new file mode 100644
index 00000000..9cbcc2f1
--- /dev/null
+++ b/src/conv_acc_sse.c
@@ -0,0 +1,129 @@
+/*
+ * Intel SSE Viterbi decoder
+ *
+ * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
+ *
+ * All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+#include "config.h"
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+
+#if defined(HAVE_SSE4_1)
+#include <smmintrin.h>
+#endif
+
+#define SSE_ALIGN 16
+
+/* Broadcast 16-bit integer
+ * Repeat the low 16-bit integer to all elements of the 128-bit SSE
+ * register. Only AVX2 has a dedicated broadcast instruction; use repeat
+ * unpacks for SSE only architectures. This is a destructive operation and
+ * the source register is overwritten.
+ *
+ * Input:
+ * M0 - Low 16-bit element is read
+ *
+ * Output:
+ * M0 - Contains broadcasted values
+ */
+#define SSE_BROADCAST(M0) \
+{ \
+	M0 = _mm_unpacklo_epi16(M0, M0); \
+	M0 = _mm_unpacklo_epi32(M0, M0); \
+	M0 = _mm_unpacklo_epi64(M0, M0); \
+}
+
+/**
+ * Include common SSE implementation
+ */
+#include <conv_acc_sse_impl.h>
+
+/* Aligned Memory Allocator
+ * SSE requires 16-byte memory alignment. We store relevant trellis values
+ * (accumulated sums, outputs, and path decisions) as 16 bit signed integers
+ * so the allocated memory is cast as such.
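+ *
+ * As an illustration, every buffer returned here satisfies
+ * ((uintptr_t) ptr & 0xf) == 0, which the _mm_load_si128() and
+ * _mm_store_si128() accesses in the metric units below require; an
+ * unaligned buffer would fault on the aligned SSE loads.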
+ */
+__attribute__ ((visibility("hidden")))
+int16_t *osmo_conv_sse_vdec_malloc(size_t n)
+{
+	return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_vdec_free(int16_t *ptr)
+{
+	_mm_free(ptr);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_metrics_k5_n2(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k5_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_metrics_k5_n3(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_metrics_k5_n4(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_metrics_k7_n2(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k7_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_metrics_k7_n3(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_metrics_k7_n4(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
+}
diff --git a/src/conv_acc_sse_avx.c b/src/conv_acc_sse_avx.c
new file mode 100644
index 00000000..5d9a82b7
--- /dev/null
+++ b/src/conv_acc_sse_avx.c
@@ -0,0 +1,129 @@
+/*
+ * Intel SSE + AVX Viterbi decoder
+ *
+ * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
+ *
+ * All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+#include "config.h"
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+#include <immintrin.h>
+
+#if defined(HAVE_SSE4_1)
+#include <smmintrin.h>
+#endif
+
+#define SSE_ALIGN 16
+
+/* Broadcast 16-bit integer
+ * Repeat the low 16-bit integer to all elements of the 128-bit SSE
+ * register. Only AVX2 has a dedicated broadcast instruction; use repeat
+ * unpacks for SSE only architectures. This is a destructive operation and
+ * the source register is overwritten.
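+ *
+ * With AVX2 available, the whole operation reduces to the single
+ * vpbroadcastw instruction (_mm_broadcastw_epi16() below), replacing
+ * the three-step unpack sequence used by the SSE-only variant.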
+ *
+ * Input:
+ * M0 - Low 16-bit element is read
+ *
+ * Output:
+ * M0 - Contains broadcasted values
+ */
+#define SSE_BROADCAST(M0) \
+{ \
+	M0 = _mm_broadcastw_epi16(M0); \
+}
+
+/**
+ * Include common SSE implementation
+ */
+#include <conv_acc_sse_impl.h>
+
+/* Aligned Memory Allocator
+ * SSE requires 16-byte memory alignment. We store relevant trellis values
+ * (accumulated sums, outputs, and path decisions) as 16 bit signed integers
+ * so the allocated memory is cast as such.
+ */
+__attribute__ ((visibility("hidden")))
+int16_t *osmo_conv_sse_avx_vdec_malloc(size_t n)
+{
+	return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_vdec_free(int16_t *ptr)
+{
+	_mm_free(ptr);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k5_n2(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k5_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k5_n3(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k5_n4(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k7_n2(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k7_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k7_n3(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k7_n4(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
}
diff --git a/src/conv_acc_sse_impl.h b/src/conv_acc_sse_impl.h
new file mode 100644
index 00000000..7d48c942
--- /dev/null
+++ b/src/conv_acc_sse_impl.h
@@ -0,0 +1,495 @@
+/*
+ * Intel SSE Viterbi decoder
+ *
+ * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
+ *
+ * All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */ + +extern int sse41_supported; + +/* Octo-Viterbi butterfly + * Compute 8-wide butterfly generating 16 path decisions and 16 accumulated + * sums. Inputs all packed 16-bit integers in three 128-bit XMM registers. + * Two intermediate registers are used and results are set in the upper 4 + * registers. + * + * Input: + * M0 - Path metrics 0 (packed 16-bit integers) + * M1 - Path metrics 1 (packed 16-bit integers) + * M2 - Branch metrics (packed 16-bit integers) + * + * Output: + * M2 - Selected and accumulated path metrics 0 + * M4 - Selected and accumulated path metrics 1 + * M3 - Path selections 0 + * M1 - Path selections 1 + */ +#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \ +{ \ + M3 = _mm_adds_epi16(M0, M2); \ + M4 = _mm_subs_epi16(M1, M2); \ + M0 = _mm_subs_epi16(M0, M2); \ + M1 = _mm_adds_epi16(M1, M2); \ + M2 = _mm_max_epi16(M3, M4); \ + M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \ + M4 = _mm_max_epi16(M0, M1); \ + M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \ +} + +/* Two lane deinterleaving K = 5: + * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit + * registers. The operation summarized below. Four registers are used with + * the lower 2 as input and upper 2 as output. + * + * In - 10101010 10101010 10101010 10101010 + * Out - 00000000 11111111 00000000 11111111 + * + * Input: + * M0:1 - Packed 16-bit integers + * + * Output: + * M2:3 - Deinterleaved packed 16-bit integers + */ +#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + +#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \ +{ \ + M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \ + M0 = _mm_shuffle_epi8(M0, M2); \ + M1 = _mm_shuffle_epi8(M1, M2); \ + M2 = _mm_unpacklo_epi64(M0, M1); \ + M3 = _mm_unpackhi_epi64(M0, M1); \ +} + +/* Two lane deinterleaving K = 7: + * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit + * registers. The operation summarized below. 16 registers are used with the + * lower 8 as input and upper 8 as output. + * + * In - 10101010 10101010 10101010 10101010 ... + * Out - 00000000 11111111 00000000 11111111 ... + * + * Input: + * M0:7 - Packed 16-bit integers + * + * Output: + * M8:15 - Deinterleaved packed 16-bit integers + */ +#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \ + M8, M9, M10, M11, M12, M13, M14, M15) \ +{ \ + M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \ + M0 = _mm_shuffle_epi8(M0, M8); \ + M1 = _mm_shuffle_epi8(M1, M8); \ + M2 = _mm_shuffle_epi8(M2, M8); \ + M3 = _mm_shuffle_epi8(M3, M8); \ + M4 = _mm_shuffle_epi8(M4, M8); \ + M5 = _mm_shuffle_epi8(M5, M8); \ + M6 = _mm_shuffle_epi8(M6, M8); \ + M7 = _mm_shuffle_epi8(M7, M8); \ + M8 = _mm_unpacklo_epi64(M0, M1); \ + M9 = _mm_unpackhi_epi64(M0, M1); \ + M10 = _mm_unpacklo_epi64(M2, M3); \ + M11 = _mm_unpackhi_epi64(M2, M3); \ + M12 = _mm_unpacklo_epi64(M4, M5); \ + M13 = _mm_unpackhi_epi64(M4, M5); \ + M14 = _mm_unpacklo_epi64(M6, M7); \ + M15 = _mm_unpackhi_epi64(M6, M7); \ +} + +/* Generate branch metrics N = 2: + * Compute 16 branch metrics from trellis outputs and input values. 
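+ * Each metric is seq[0] * o0 + seq[1] * o1: _mm_sign_epi16() negates
+ * the input copies wherever the corresponding ±1 trellis output is
+ * negative, and _mm_hadds_epi16() then sums adjacent pairs with
+ * saturation.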
+ * + * Input: + * M0:3 - 16 x 2 packed 16-bit trellis outputs + * M4 - Expanded and packed 16-bit input value + * + * Output: + * M6:7 - 16 computed 16-bit branch metrics + */ +#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \ +{ \ + M0 = _mm_sign_epi16(M4, M0); \ + M1 = _mm_sign_epi16(M4, M1); \ + M2 = _mm_sign_epi16(M4, M2); \ + M3 = _mm_sign_epi16(M4, M3); \ + M6 = _mm_hadds_epi16(M0, M1); \ + M7 = _mm_hadds_epi16(M2, M3); \ +} + +/* Generate branch metrics N = 4: + * Compute 8 branch metrics from trellis outputs and input values. This + * macro is reused for N less than 4 where the extra soft input bits are + * padded. + * + * Input: + * M0:3 - 8 x 4 packed 16-bit trellis outputs + * M4 - Expanded and packed 16-bit input value + * + * Output: + * M5 - 8 computed 16-bit branch metrics + */ +#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \ +{ \ + M0 = _mm_sign_epi16(M4, M0); \ + M1 = _mm_sign_epi16(M4, M1); \ + M2 = _mm_sign_epi16(M4, M2); \ + M3 = _mm_sign_epi16(M4, M3); \ + M0 = _mm_hadds_epi16(M0, M1); \ + M1 = _mm_hadds_epi16(M2, M3); \ + M5 = _mm_hadds_epi16(M0, M1); \ +} + +/* Horizontal minimum + * Compute horizontal minimum of packed unsigned 16-bit integers and place + * result in the low 16-bit element of the source register. Only SSE 4.1 + * has a dedicated minpos instruction. One intermediate register is used + * if SSE 4.1 is not available. This is a destructive operation and the + * source register is overwritten. + * + * Input: + * M0 - Packed unsigned 16-bit integers + * + * Output: + * M0 - Minimum value placed in low 16-bit element + */ +#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41) +#define SSE_MINPOS(M0, M1) \ +{ \ + if (sse41_supported) { \ + M0 = _mm_minpos_epu16(M0); \ + } else { \ + M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \ + M0 = _mm_min_epi16(M0, M1); \ + M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \ + M0 = _mm_min_epi16(M0, M1); \ + M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \ + M0 = _mm_min_epi16(M0, M1); \ + } \ +} +#else +#define SSE_MINPOS(M0, M1) \ +{ \ + M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \ + M0 = _mm_min_epi16(M0, M1); \ + M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \ + M0 = _mm_min_epi16(M0, M1); \ + M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \ + M0 = _mm_min_epi16(M0, M1); \ +} +#endif + +/* Normalize state metrics K = 5: + * Compute 16-wide normalization by subtracting the smallest value from + * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers. + * Two intermediate registers are used and normalized results are placed + * in the originating locations. + * + * Input: + * M0:1 - Path metrics 0:1 (packed 16-bit integers) + * + * Output: + * M0:1 - Normalized path metrics 0:1 + */ +#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \ +{ \ + M2 = _mm_min_epi16(M0, M1); \ + SSE_MINPOS(M2, M3) \ + SSE_BROADCAST(M2) \ + M0 = _mm_subs_epi16(M0, M2); \ + M1 = _mm_subs_epi16(M1, M2); \ +} + +/* Normalize state metrics K = 7: + * Compute 64-wide normalization by subtracting the smallest value from + * all values. Inputs are 8 registers of accumulated sums and 4 temporary + * registers. Normalized results are returned in the originating locations. 
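+ * The minimum is found once with a tree of _mm_min_epi16() reductions
+ * across the eight registers, followed by SSE_MINPOS() and
+ * SSE_BROADCAST(), and is then subtracted from all 64 sums.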
+ *
+ * Input:
+ * M0:7 - Path metrics 0:7 (packed 16-bit integers)
+ *
+ * Output:
+ * M0:7 - Normalized path metrics 0:7
+ */
+#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
+{ \
+	M8 = _mm_min_epi16(M0, M1); \
+	M9 = _mm_min_epi16(M2, M3); \
+	M10 = _mm_min_epi16(M4, M5); \
+	M11 = _mm_min_epi16(M6, M7); \
+	M8 = _mm_min_epi16(M8, M9); \
+	M10 = _mm_min_epi16(M10, M11); \
+	M8 = _mm_min_epi16(M8, M10); \
+	SSE_MINPOS(M8, M9) \
+	SSE_BROADCAST(M8) \
+	M0 = _mm_subs_epi16(M0, M8); \
+	M1 = _mm_subs_epi16(M1, M8); \
+	M2 = _mm_subs_epi16(M2, M8); \
+	M3 = _mm_subs_epi16(M3, M8); \
+	M4 = _mm_subs_epi16(M4, M8); \
+	M5 = _mm_subs_epi16(M5, M8); \
+	M6 = _mm_subs_epi16(M6, M8); \
+	M7 = _mm_subs_epi16(M7, M8); \
+}
+
+/* Combined BMU/PMU (K=5, N=2)
+ * Compute branch metrics followed by path metrics for the half rate
+ * 16-state trellis. 8 butterflies are computed. Accumulated path sums are
+ * not preserved; they are read from and written back to the same memory
+ * location. Normalize the sums if required.
+ */
+__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	__m128i m0, m1, m2, m3, m4, m5, m6;
+
+	/* (BMU) Load input sequence */
+	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
+
+	/* (BMU) Load trellis outputs */
+	m0 = _mm_load_si128((__m128i *) &out[0]);
+	m1 = _mm_load_si128((__m128i *) &out[8]);
+
+	/* (BMU) Compute branch metrics */
+	m0 = _mm_sign_epi16(m2, m0);
+	m1 = _mm_sign_epi16(m2, m1);
+	m2 = _mm_hadds_epi16(m0, m1);
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = _mm_load_si128((__m128i *) &sums[0]);
+	m1 = _mm_load_si128((__m128i *) &sums[8]);
+
+	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)
+
+	/* (PMU) Butterflies: 0-7 */
+	SSE_BUTTERFLY(m3, m4, m2, m5, m6)
+
+	if (norm)
+		SSE_NORMALIZE_K5(m2, m6, m0, m1)
+
+	_mm_store_si128((__m128i *) &sums[0], m2);
+	_mm_store_si128((__m128i *) &sums[8], m6);
+	_mm_store_si128((__m128i *) &paths[0], m5);
+	_mm_store_si128((__m128i *) &paths[8], m4);
+}
+
+/* Combined BMU/PMU (K=5, N=3 and N=4)
+ * Compute branch metrics followed by path metrics for the 16-state trellis
+ * at rates up to 1/4. 8 butterflies are computed. The input sequence is
+ * read four 16-bit values at a time, and extra values should be set to
+ * zero for rates other than 1/4. Normally only rates 1/3 and 1/4 are used
+ * as there is a dedicated implementation of rate 1/2.
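+ *
+ * For example, the N=3 wrappers in conv_acc_sse.c and conv_acc_sse_avx.c
+ * pass { val[0], val[1], val[2], 0 }, so the fourth product contributes
+ * nothing and this kernel yields the correct rate-1/3 branch metrics.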
+ */ +__always_inline static void _sse_metrics_k5_n4(const int16_t *val, + const int16_t *out, int16_t *sums, int16_t *paths, int norm) +{ + __m128i m0, m1, m2, m3, m4, m5, m6; + + /* (BMU) Load input sequence */ + m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val)); + + /* (BMU) Load trellis outputs */ + m0 = _mm_load_si128((__m128i *) &out[0]); + m1 = _mm_load_si128((__m128i *) &out[8]); + m2 = _mm_load_si128((__m128i *) &out[16]); + m3 = _mm_load_si128((__m128i *) &out[24]); + + SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2) + + /* (PMU) Load accumulated path metrics */ + m0 = _mm_load_si128((__m128i *) &sums[0]); + m1 = _mm_load_si128((__m128i *) &sums[8]); + + SSE_DEINTERLEAVE_K5(m0, m1, m3, m4) + + /* (PMU) Butterflies: 0-7 */ + SSE_BUTTERFLY(m3, m4, m2, m5, m6) + + if (norm) + SSE_NORMALIZE_K5(m2, m6, m0, m1) + + _mm_store_si128((__m128i *) &sums[0], m2); + _mm_store_si128((__m128i *) &sums[8], m6); + _mm_store_si128((__m128i *) &paths[0], m5); + _mm_store_si128((__m128i *) &paths[8], m4); +} + +/* Combined BMU/PMU (K=7, N=2) + * Compute branch metrics followed by path metrics for half rate 64-state + * trellis. 32 butterfly operations are computed. Deinterleaving path + * metrics requires usage of the full SSE register file, so separate sums + * before computing branch metrics to avoid register spilling. + */ +__always_inline static void _sse_metrics_k7_n2(const int16_t *val, + const int16_t *out, int16_t *sums, int16_t *paths, int norm) +{ + __m128i m0, m1, m2, m3, m4, m5, m6, m7, m8, + m9, m10, m11, m12, m13, m14, m15; + + /* (PMU) Load accumulated path metrics */ + m0 = _mm_load_si128((__m128i *) &sums[0]); + m1 = _mm_load_si128((__m128i *) &sums[8]); + m2 = _mm_load_si128((__m128i *) &sums[16]); + m3 = _mm_load_si128((__m128i *) &sums[24]); + m4 = _mm_load_si128((__m128i *) &sums[32]); + m5 = _mm_load_si128((__m128i *) &sums[40]); + m6 = _mm_load_si128((__m128i *) &sums[48]); + m7 = _mm_load_si128((__m128i *) &sums[56]); + + /* (PMU) Deinterleave to even-odd registers */ + SSE_DEINTERLEAVE_K7(m0, m1, m2, m3 ,m4 ,m5, m6, m7, + m8, m9, m10, m11, m12, m13, m14, m15) + + /* (BMU) Load input symbols */ + m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val)); + + /* (BMU) Load trellis outputs */ + m0 = _mm_load_si128((__m128i *) &out[0]); + m1 = _mm_load_si128((__m128i *) &out[8]); + m2 = _mm_load_si128((__m128i *) &out[16]); + m3 = _mm_load_si128((__m128i *) &out[24]); + + SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5) + + m0 = _mm_load_si128((__m128i *) &out[32]); + m1 = _mm_load_si128((__m128i *) &out[40]); + m2 = _mm_load_si128((__m128i *) &out[48]); + m3 = _mm_load_si128((__m128i *) &out[56]); + + SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7) + + /* (PMU) Butterflies: 0-15 */ + SSE_BUTTERFLY(m8, m9, m4, m0, m1) + SSE_BUTTERFLY(m10, m11, m5, m2, m3) + + _mm_store_si128((__m128i *) &paths[0], m0); + _mm_store_si128((__m128i *) &paths[8], m2); + _mm_store_si128((__m128i *) &paths[32], m9); + _mm_store_si128((__m128i *) &paths[40], m11); + + /* (PMU) Butterflies: 17-31 */ + SSE_BUTTERFLY(m12, m13, m6, m0, m2) + SSE_BUTTERFLY(m14, m15, m7, m9, m11) + + _mm_store_si128((__m128i *) &paths[16], m0); + _mm_store_si128((__m128i *) &paths[24], m9); + _mm_store_si128((__m128i *) &paths[48], m13); + _mm_store_si128((__m128i *) &paths[56], m15); + + if (norm) + SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, + m7, m11, m0, m8, m9, m10) + + _mm_store_si128((__m128i *) &sums[0], m4); + _mm_store_si128((__m128i *) &sums[8], m5); + _mm_store_si128((__m128i *) &sums[16], m6); + 
_mm_store_si128((__m128i *) &sums[24], m7); + _mm_store_si128((__m128i *) &sums[32], m1); + _mm_store_si128((__m128i *) &sums[40], m3); + _mm_store_si128((__m128i *) &sums[48], m2); + _mm_store_si128((__m128i *) &sums[56], m11); +} + +/* Combined BMU/PMU (K=7, N=3 and N=4) + * Compute branch metrics followed by path metrics for half rate 64-state + * trellis. 32 butterfly operations are computed. Deinterleave path + * metrics before computing branch metrics as in the half rate case. + */ +__always_inline static void _sse_metrics_k7_n4(const int16_t *val, + const int16_t *out, int16_t *sums, int16_t *paths, int norm) +{ + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i m8, m9, m10, m11, m12, m13, m14, m15; + + /* (PMU) Load accumulated path metrics */ + m0 = _mm_load_si128((__m128i *) &sums[0]); + m1 = _mm_load_si128((__m128i *) &sums[8]); + m2 = _mm_load_si128((__m128i *) &sums[16]); + m3 = _mm_load_si128((__m128i *) &sums[24]); + m4 = _mm_load_si128((__m128i *) &sums[32]); + m5 = _mm_load_si128((__m128i *) &sums[40]); + m6 = _mm_load_si128((__m128i *) &sums[48]); + m7 = _mm_load_si128((__m128i *) &sums[56]); + + /* (PMU) Deinterleave into even and odd packed registers */ + SSE_DEINTERLEAVE_K7(m0, m1, m2, m3 ,m4 ,m5, m6, m7, + m8, m9, m10, m11, m12, m13, m14, m15) + + /* (BMU) Load and expand 8-bit input out to 16-bits */ + m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val)); + + /* (BMU) Load and compute branch metrics */ + m0 = _mm_load_si128((__m128i *) &out[0]); + m1 = _mm_load_si128((__m128i *) &out[8]); + m2 = _mm_load_si128((__m128i *) &out[16]); + m3 = _mm_load_si128((__m128i *) &out[24]); + + SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4) + + m0 = _mm_load_si128((__m128i *) &out[32]); + m1 = _mm_load_si128((__m128i *) &out[40]); + m2 = _mm_load_si128((__m128i *) &out[48]); + m3 = _mm_load_si128((__m128i *) &out[56]); + + SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5) + + m0 = _mm_load_si128((__m128i *) &out[64]); + m1 = _mm_load_si128((__m128i *) &out[72]); + m2 = _mm_load_si128((__m128i *) &out[80]); + m3 = _mm_load_si128((__m128i *) &out[88]); + + SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6) + + m0 = _mm_load_si128((__m128i *) &out[96]); + m1 = _mm_load_si128((__m128i *) &out[104]); + m2 = _mm_load_si128((__m128i *) &out[112]); + m3 = _mm_load_si128((__m128i *) &out[120]); + + SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7) + + /* (PMU) Butterflies: 0-15 */ + SSE_BUTTERFLY(m8, m9, m4, m0, m1) + SSE_BUTTERFLY(m10, m11, m5, m2, m3) + + _mm_store_si128((__m128i *) &paths[0], m0); + _mm_store_si128((__m128i *) &paths[8], m2); + _mm_store_si128((__m128i *) &paths[32], m9); + _mm_store_si128((__m128i *) &paths[40], m11); + + /* (PMU) Butterflies: 17-31 */ + SSE_BUTTERFLY(m12, m13, m6, m0, m2) + SSE_BUTTERFLY(m14, m15, m7, m9, m11) + + _mm_store_si128((__m128i *) &paths[16], m0); + _mm_store_si128((__m128i *) &paths[24], m9); + _mm_store_si128((__m128i *) &paths[48], m13); + _mm_store_si128((__m128i *) &paths[56], m15); + + if (norm) + SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, + m7, m11, m0, m8, m9, m10) + + _mm_store_si128((__m128i *) &sums[0], m4); + _mm_store_si128((__m128i *) &sums[8], m5); + _mm_store_si128((__m128i *) &sums[16], m6); + _mm_store_si128((__m128i *) &sums[24], m7); + _mm_store_si128((__m128i *) &sums[32], m1); + _mm_store_si128((__m128i *) &sums[40], m3); + _mm_store_si128((__m128i *) &sums[48], m2); + _mm_store_si128((__m128i *) &sums[56], m11); +} diff --git a/src/viterbi.c b/src/viterbi.c deleted file mode 100644 index 308cfe07..00000000 --- a/src/viterbi.c +++ 
/dev/null @@ -1,728 +0,0 @@ -/* - * Viterbi decoder - * - * Copyright (C) 2013, 2014 Thomas Tsou - * - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include -#include -#include - -#include "config.h" - -#include - -#define BIT2NRZ(REG,N) (((REG >> N) & 0x01) * 2 - 1) * -1 -#define NUM_STATES(K) (K == 7 ? 64 : 16) - -#define INIT_POINTERS(simd) \ -{ \ - osmo_conv_metrics_k5_n2 = osmo_conv_##simd##_metrics_k5_n2; \ - osmo_conv_metrics_k5_n3 = osmo_conv_##simd##_metrics_k5_n3; \ - osmo_conv_metrics_k5_n4 = osmo_conv_##simd##_metrics_k5_n4; \ - osmo_conv_metrics_k7_n2 = osmo_conv_##simd##_metrics_k7_n2; \ - osmo_conv_metrics_k7_n3 = osmo_conv_##simd##_metrics_k7_n3; \ - osmo_conv_metrics_k7_n4 = osmo_conv_##simd##_metrics_k7_n4; \ - vdec_malloc = &osmo_conv_##simd##_vdec_malloc; \ - vdec_free = &osmo_conv_##simd##_vdec_free; \ -} - -static int init_complete = 0; - -__attribute__ ((visibility("hidden"))) int avx2_supported = 0; -__attribute__ ((visibility("hidden"))) int sse3_supported = 0; -__attribute__ ((visibility("hidden"))) int sse41_supported = 0; - -/** - * These pointers are being initialized at runtime by the - * osmo_conv_init() depending on supported SIMD extensions. 
- */ -static int16_t *(*vdec_malloc)(size_t n); -static void (*vdec_free)(int16_t *ptr); - -void (*osmo_conv_metrics_k5_n2)(const int8_t *seq, - const int16_t *out, int16_t *sums, int16_t *paths, int norm); -void (*osmo_conv_metrics_k5_n3)(const int8_t *seq, - const int16_t *out, int16_t *sums, int16_t *paths, int norm); -void (*osmo_conv_metrics_k5_n4)(const int8_t *seq, - const int16_t *out, int16_t *sums, int16_t *paths, int norm); -void (*osmo_conv_metrics_k7_n2)(const int8_t *seq, - const int16_t *out, int16_t *sums, int16_t *paths, int norm); -void (*osmo_conv_metrics_k7_n3)(const int8_t *seq, - const int16_t *out, int16_t *sums, int16_t *paths, int norm); -void (*osmo_conv_metrics_k7_n4)(const int8_t *seq, - const int16_t *out, int16_t *sums, int16_t *paths, int norm); - -/* Forward malloc wrappers */ -int16_t *osmo_conv_gen_vdec_malloc(size_t n); -void osmo_conv_gen_vdec_free(int16_t *ptr); - -#if defined(HAVE_SSE3) -int16_t *osmo_conv_sse_vdec_malloc(size_t n); -void osmo_conv_sse_vdec_free(int16_t *ptr); -#endif - -#if defined(HAVE_SSE3) && defined(HAVE_AVX2) -int16_t *osmo_conv_sse_avx_vdec_malloc(size_t n); -void osmo_conv_sse_avx_vdec_free(int16_t *ptr); -#endif - -/* Forward Metric Units */ -void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_gen_metrics_k5_n3(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_gen_metrics_k5_n4(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_gen_metrics_k7_n2(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_gen_metrics_k7_n3(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); - -#if defined(HAVE_SSE3) -void osmo_conv_sse_metrics_k5_n2(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_metrics_k5_n3(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_metrics_k5_n4(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_metrics_k7_n2(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_metrics_k7_n3(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_metrics_k7_n4(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -#endif - -#if defined(HAVE_SSE3) && defined(HAVE_AVX2) -void osmo_conv_sse_avx_metrics_k5_n2(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_avx_metrics_k5_n3(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_avx_metrics_k5_n4(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_avx_metrics_k7_n2(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_avx_metrics_k7_n3(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_avx_metrics_k7_n4(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -#endif - -/* Trellis State - * state - Internal lshift register value - * prev - Register values of previous 0 and 1 
-/* Trellis State
- * state - Internal lshift register value
- * prev  - Register values of previous 0 and 1 states
- */
-struct vstate {
-	unsigned state;
-	unsigned prev[2];
-};
-
-/* Trellis Object
- * num_states - Number of states in the trellis
- * sums       - Accumulated path metrics
- * outputs    - Trellis output values
- * vals       - Input value that led to each state
- */
-struct vtrellis {
-	int num_states;
-	int16_t *sums;
-	int16_t *outputs;
-	uint8_t *vals;
-};
-
-/* Viterbi Decoder
- * n         - Code order
- * k         - Constraint length
- * len       - Horizontal length of trellis
- * recursive - Set to '1' if the code is recursive
- * intrvl    - Normalization interval
- * trellis   - Trellis object
- * punc      - Puncturing sequence
- * paths     - Trellis paths
- */
-struct vdecoder {
-	int n;
-	int k;
-	int len;
-	int recursive;
-	int intrvl;
-	struct vtrellis *trellis;
-	int *punc;
-	int16_t **paths;
-
-	void (*metric_func)(const int8_t *, const int16_t *,
-		int16_t *, int16_t *, int);
-};
-
-/* Accessor calls */
-static inline int conv_code_recursive(const struct osmo_conv_code *code)
-{
-	return code->next_term_output ? 1 : 0;
-}
-
-/* Left shift and mask for finding the previous state */
-static unsigned vstate_lshift(unsigned reg, int k, int val)
-{
-	unsigned mask;
-
-	if (k == 5)
-		mask = 0x0e;
-	else if (k == 7)
-		mask = 0x3e;
-	else
-		mask = 0;
-
-	return ((reg << 1) & mask) | val;
-}
-
-/* Bit endian manipulators */
-static inline unsigned bitswap2(unsigned v)
-{
-	return ((v & 0x02) >> 1) | ((v & 0x01) << 1);
-}
-
-static inline unsigned bitswap3(unsigned v)
-{
-	return ((v & 0x04) >> 2) | ((v & 0x02) >> 0) |
-		((v & 0x01) << 2);
-}
-
-static inline unsigned bitswap4(unsigned v)
-{
-	return ((v & 0x08) >> 3) | ((v & 0x04) >> 1) |
-		((v & 0x02) << 1) | ((v & 0x01) << 3);
-}
-
-static inline unsigned bitswap5(unsigned v)
-{
-	return ((v & 0x10) >> 4) | ((v & 0x08) >> 2) | ((v & 0x04) >> 0) |
-		((v & 0x02) << 2) | ((v & 0x01) << 4);
-}
-
-static inline unsigned bitswap6(unsigned v)
-{
-	return ((v & 0x20) >> 5) | ((v & 0x10) >> 3) | ((v & 0x08) >> 1) |
-		((v & 0x04) << 1) | ((v & 0x02) << 3) | ((v & 0x01) << 5);
-}
-
-static unsigned bitswap(unsigned v, unsigned n)
-{
-	switch (n) {
-	case 1:
-		return v;
-	case 2:
-		return bitswap2(v);
-	case 3:
-		return bitswap3(v);
-	case 4:
-		return bitswap4(v);
-	case 5:
-		return bitswap5(v);
-	case 6:
-		return bitswap6(v);
-	default:
-		return 0;
-	}
-}
-
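/*
 * [Editor's sketch -- not part of the patch] The bitswap helpers above
 * reverse the order of the n low bits of a word, translating between the
 * right-shifting register convention used here and the left-shifting
 * convention expected by the osmocom transition tables (see the comment
 * on gen_output() below). A generic loop-based reversal produces the
 * same results and can serve as a cross-check for the unrolled variants;
 * bitswap_ref() is an illustrative helper, not part of the library.
 */
#include <assert.h>
#include <stdio.h>

static unsigned bitswap_ref(unsigned v, unsigned n)
{
	unsigned i, r = 0;

	for (i = 0; i < n; i++)		/* mirror bit i to bit n-1-i */
		if (v & (1u << i))
			r |= 1u << (n - 1 - i);
	return r;
}

int main(void)
{
	assert(bitswap_ref(0x6, 3) == 0x3);	/* 110 -> 011 */
	assert(bitswap_ref(0x1, 6) == 0x20);	/* 000001 -> 100000 */
	/* reversing twice is the identity */
	assert(bitswap_ref(bitswap_ref(0x2b, 6), 6) == 0x2b);
	puts("bit-reversal cross-checks passed");
	return 0;
}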
-/* Generate non-recursive state output from generator state table
- * Note that the shift register moves right (i.e. the most recent bit is
- * shifted in at the k-1 bit position), which is the typical textbook
- * representation. The API transition table expects the most recent bit
- * in the low order bit, or left shift. A bitswap operation is required
- * to accommodate the difference.
- */
-static unsigned gen_output(struct vstate *state, int val,
-	const struct osmo_conv_code *code)
-{
-	unsigned out, prev;
-
-	prev = bitswap(state->prev[0], code->K - 1);
-	out = code->next_output[prev][val];
-	out = bitswap(out, code->N);
-
-	return out;
-}
-
-/* Populate non-recursive trellis state
- * For a given state defined by the k-1 length shift register, find the
- * value of the input bit that drove the trellis to that state. Also
- * generate the N outputs of the generator polynomial at that state.
- */
-static int gen_state_info(uint8_t *val, unsigned reg,
-	int16_t *output, const struct osmo_conv_code *code)
-{
-	int i;
-	unsigned out;
-	struct vstate state;
-
-	/* Previous '0' state */
-	state.state = reg;
-	state.prev[0] = vstate_lshift(reg, code->K, 0);
-	state.prev[1] = vstate_lshift(reg, code->K, 1);
-
-	*val = (reg >> (code->K - 2)) & 0x01;
-
-	/* Transition output */
-	out = gen_output(&state, *val, code);
-
-	/* Unpack to NRZ */
-	for (i = 0; i < code->N; i++)
-		output[i] = BIT2NRZ(out, i);
-
-	return 0;
-}
-
-/* Generate recursive state output from generator state table */
-static unsigned gen_recursive_output(struct vstate *state,
-	uint8_t *val, unsigned reg,
-	const struct osmo_conv_code *code, int pos)
-{
-	int val0, val1;
-	unsigned out, prev;
-
-	/* Previous '0' state */
-	prev = vstate_lshift(reg, code->K, 0);
-	prev = bitswap(prev, code->K - 1);
-
-	/* Input value */
-	val0 = (reg >> (code->K - 2)) & 0x01;
-	val1 = (code->next_term_output[prev] >> pos) & 0x01;
-	*val = val0 == val1 ? 0 : 1;
-
-	/* Wrapper for osmocom state access */
-	prev = bitswap(state->prev[0], code->K - 1);
-
-	/* Compute the transition output */
-	out = code->next_output[prev][*val];
-	out = bitswap(out, code->N);
-
-	return out;
-}
-
-/* Populate recursive trellis state
- * The bit position of the systematic bit is not explicitly marked by the
- * API, so it must be extracted from the generator table. Otherwise,
- * populate the trellis similarly to the non-recursive version.
- * Non-systematic recursive codes are not supported.
- */
-static int gen_recursive_state_info(uint8_t *val,
-	unsigned reg, int16_t *output, const struct osmo_conv_code *code)
-{
-	int i, j, pos = -1;
-	int ns = NUM_STATES(code->K);
-	unsigned out;
-	struct vstate state;
-
-	/* Previous '0' and '1' states */
-	state.state = reg;
-	state.prev[0] = vstate_lshift(reg, code->K, 0);
-	state.prev[1] = vstate_lshift(reg, code->K, 1);
-
-	/* Find recursive bit location */
-	for (i = 0; i < code->N; i++) {
-		for (j = 0; j < ns; j++) {
-			if ((code->next_output[j][0] >> i) & 0x01)
-				break;
-		}
-
-		if (j == ns) {
-			pos = i;
-			break;
-		}
-	}
-
-	/* Non-systematic recursive code not supported */
-	if (pos < 0)
-		return -EPROTO;
-
-	/* Transition output */
-	out = gen_recursive_output(&state, val, reg, code, pos);
-
-	/* Unpack to NRZ */
-	for (i = 0; i < code->N; i++)
-		output[i] = BIT2NRZ(out, i);
-
-	return 0;
-}
-
-/* Release the trellis */
-static void free_trellis(struct vtrellis *trellis)
-{
-	if (!trellis)
-		return;
-
-	vdec_free(trellis->outputs);
-	vdec_free(trellis->sums);
-	free(trellis->vals);
-	free(trellis);
-}
-
-/* Allocate and initialize the trellis object
- * Initialization consists of generating the outputs and output value of a
- * given state. Due to trellis symmetry and anti-symmetry, only one of the
- * transition paths is utilized by the butterfly operation in the forward
- * recursion, so only one set of N outputs is required per state variable.
- */
-static struct vtrellis *generate_trellis(const struct osmo_conv_code *code)
-{
-	int i, rc = -1;
-	struct vtrellis *trellis;
-	int16_t *outputs;
-
-	int ns = NUM_STATES(code->K);
-	int recursive = conv_code_recursive(code);