From e4fe71c04fbe69581ba10f7b19158a4559b5a5d8 Mon Sep 17 00:00:00 2001 From: Vadim Yanitskiy Date: Mon, 19 Jun 2017 17:59:48 +0700 Subject: core/conv: use proper filenames We already have generic convolutional transcoding implementation written by Sylvain Munaut and named 'conv.c', so 'viterbi_*' names looked a bit confusing. Let's use a single naming scheme for Viterbi related code. Change-Id: I61062a8d1fbf5f5fc85b4fac58dc4e9fa8b5ef90 --- src/Makefile.am | 16 +- src/conv_acc.c | 728 +++++++++++++++++++++++++++++++++++++++++++++++ src/conv_acc_generic.c | 207 ++++++++++++++ src/conv_acc_sse.c | 129 +++++++++ src/conv_acc_sse_avx.c | 129 +++++++++ src/conv_acc_sse_impl.h | 495 ++++++++++++++++++++++++++++++++ src/viterbi.c | 728 ----------------------------------------------- src/viterbi_generic.c | 207 -------------- src/viterbi_sse.c | 129 --------- src/viterbi_sse_avx.c | 129 --------- src/viterbi_sse_common.h | 495 -------------------------------- 11 files changed, 1696 insertions(+), 1696 deletions(-) create mode 100644 src/conv_acc.c create mode 100644 src/conv_acc_generic.c create mode 100644 src/conv_acc_sse.c create mode 100644 src/conv_acc_sse_avx.c create mode 100644 src/conv_acc_sse_impl.h delete mode 100644 src/viterbi.c delete mode 100644 src/viterbi_generic.c delete mode 100644 src/viterbi_sse.c delete mode 100644 src/viterbi_sse_avx.c delete mode 100644 src/viterbi_sse_common.h (limited to 'src') diff --git a/src/Makefile.am b/src/Makefile.am index e98c623c..692b699b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -21,28 +21,28 @@ libosmocore_la_SOURCES = timer.c timer_gettimeofday.c select.c signal.c msgb.c b conv.c application.c rbtree.c strrb.c \ loggingrb.c crc8gen.c crc16gen.c crc32gen.c crc64gen.c \ macaddr.c stat_item.c stats.c stats_statsd.c prim.c \ - viterbi.c viterbi_generic.c sercomm.c + conv_acc.c conv_acc_generic.c sercomm.c if HAVE_SSE3 -libosmocore_la_SOURCES += viterbi_sse.c +libosmocore_la_SOURCES += conv_acc_sse.c if HAVE_SSE4_1 -viterbi_sse.lo : CFLAGS += -msse3 -msse4.1 +conv_acc_sse.lo : CFLAGS += -msse3 -msse4.1 else -viterbi_sse.lo : CFLAGS += -msse3 +conv_acc_sse.lo : CFLAGS += -msse3 endif if HAVE_AVX2 -libosmocore_la_SOURCES += viterbi_sse_avx.c +libosmocore_la_SOURCES += conv_acc_sse_avx.c if HAVE_SSE4_1 -viterbi_sse_avx.lo : CFLAGS += -msse3 -mavx2 -msse4.1 +conv_acc_sse_avx.lo : CFLAGS += -msse3 -mavx2 -msse4.1 else -viterbi_sse_avx.lo : CFLAGS += -msse3 -mavx2 +conv_acc_sse_avx.lo : CFLAGS += -msse3 -mavx2 endif endif endif BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c -EXTRA_DIST = viterbi_sse_common.h +EXTRA_DIST = conv_acc_sse_impl.h if ENABLE_PLUGIN libosmocore_la_SOURCES += plugin.c diff --git a/src/conv_acc.c b/src/conv_acc.c new file mode 100644 index 00000000..308cfe07 --- /dev/null +++ b/src/conv_acc.c @@ -0,0 +1,728 @@ +/* + * Viterbi decoder + * + * Copyright (C) 2013, 2014 Thomas Tsou + * + * All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "config.h"
+
+#include <osmocom/core/conv.h>
+
+#define BIT2NRZ(REG,N)	(((REG >> N) & 0x01) * 2 - 1) * -1
+#define NUM_STATES(K)	(K == 7 ? 64 : 16)
+
+#define INIT_POINTERS(simd) \
+{ \
+	osmo_conv_metrics_k5_n2 = osmo_conv_##simd##_metrics_k5_n2; \
+	osmo_conv_metrics_k5_n3 = osmo_conv_##simd##_metrics_k5_n3; \
+	osmo_conv_metrics_k5_n4 = osmo_conv_##simd##_metrics_k5_n4; \
+	osmo_conv_metrics_k7_n2 = osmo_conv_##simd##_metrics_k7_n2; \
+	osmo_conv_metrics_k7_n3 = osmo_conv_##simd##_metrics_k7_n3; \
+	osmo_conv_metrics_k7_n4 = osmo_conv_##simd##_metrics_k7_n4; \
+	vdec_malloc = &osmo_conv_##simd##_vdec_malloc; \
+	vdec_free = &osmo_conv_##simd##_vdec_free; \
+}
+
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * These pointers are initialized at runtime by osmo_conv_init(),
+ * depending on the supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
+static void (*vdec_free)(int16_t *ptr);
+
+void (*osmo_conv_metrics_k5_n2)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k5_n3)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k5_n4)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k7_n2)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k7_n3)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+void (*osmo_conv_metrics_k7_n4)(const int8_t *seq,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm);
+
+/* Forward malloc wrappers */
+int16_t *osmo_conv_gen_vdec_malloc(size_t n);
+void osmo_conv_gen_vdec_free(int16_t *ptr);
+
+#if defined(HAVE_SSE3)
+int16_t *osmo_conv_sse_vdec_malloc(size_t n);
+void osmo_conv_sse_vdec_free(int16_t *ptr);
+#endif
+
+#if defined(HAVE_SSE3) && defined(HAVE_AVX2)
+int16_t *osmo_conv_sse_avx_vdec_malloc(size_t n);
+void osmo_conv_sse_avx_vdec_free(int16_t *ptr);
+#endif
+
+/* Forward Metric Units */
+void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+
+#if defined(HAVE_SSE3)
+void osmo_conv_sse_metrics_k5_n2(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_sse_metrics_k5_n3(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_sse_metrics_k5_n4(const int8_t *seq, const int16_t *out,
+	int16_t *sums,
int16_t *paths, int norm); +void osmo_conv_sse_metrics_k7_n2(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_metrics_k7_n3(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_metrics_k7_n4(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +#endif + +#if defined(HAVE_SSE3) && defined(HAVE_AVX2) +void osmo_conv_sse_avx_metrics_k5_n2(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_avx_metrics_k5_n3(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_avx_metrics_k5_n4(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_avx_metrics_k7_n2(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_avx_metrics_k7_n3(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +void osmo_conv_sse_avx_metrics_k7_n4(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm); +#endif + +/* Trellis State + * state - Internal lshift register value + * prev - Register values of previous 0 and 1 states + */ +struct vstate { + unsigned state; + unsigned prev[2]; +}; + +/* Trellis Object + * num_states - Number of states in the trellis + * sums - Accumulated path metrics + * outputs - Trellis output values + * vals - Input value that led to each state + */ +struct vtrellis { + int num_states; + int16_t *sums; + int16_t *outputs; + uint8_t *vals; +}; + +/* Viterbi Decoder + * n - Code order + * k - Constraint length + * len - Horizontal length of trellis + * recursive - Set to '1' if the code is recursive + * intrvl - Normalization interval + * trellis - Trellis object + * punc - Puncturing sequence + * paths - Trellis paths + */ +struct vdecoder { + int n; + int k; + int len; + int recursive; + int intrvl; + struct vtrellis *trellis; + int *punc; + int16_t **paths; + + void (*metric_func)(const int8_t *, const int16_t *, + int16_t *, int16_t *, int); +}; + +/* Accessor calls */ +static inline int conv_code_recursive(const struct osmo_conv_code *code) +{ + return code->next_term_output ? 
1 : 0; +} + +/* Left shift and mask for finding the previous state */ +static unsigned vstate_lshift(unsigned reg, int k, int val) +{ + unsigned mask; + + if (k == 5) + mask = 0x0e; + else if (k == 7) + mask = 0x3e; + else + mask = 0; + + return ((reg << 1) & mask) | val; +} + +/* Bit endian manipulators */ +static inline unsigned bitswap2(unsigned v) +{ + return ((v & 0x02) >> 1) | ((v & 0x01) << 1); +} + +static inline unsigned bitswap3(unsigned v) +{ + return ((v & 0x04) >> 2) | ((v & 0x02) >> 0) | + ((v & 0x01) << 2); +} + +static inline unsigned bitswap4(unsigned v) +{ + return ((v & 0x08) >> 3) | ((v & 0x04) >> 1) | + ((v & 0x02) << 1) | ((v & 0x01) << 3); +} + +static inline unsigned bitswap5(unsigned v) +{ + return ((v & 0x10) >> 4) | ((v & 0x08) >> 2) | ((v & 0x04) >> 0) | + ((v & 0x02) << 2) | ((v & 0x01) << 4); +} + +static inline unsigned bitswap6(unsigned v) +{ + return ((v & 0x20) >> 5) | ((v & 0x10) >> 3) | ((v & 0x08) >> 1) | + ((v & 0x04) << 1) | ((v & 0x02) << 3) | ((v & 0x01) << 5); +} + +static unsigned bitswap(unsigned v, unsigned n) +{ + switch (n) { + case 1: + return v; + case 2: + return bitswap2(v); + case 3: + return bitswap3(v); + case 4: + return bitswap4(v); + case 5: + return bitswap5(v); + case 6: + return bitswap6(v); + default: + return 0; + } +} + +/* Generate non-recursive state output from generator state table + * Note that the shift register moves right (i.e. the most recent bit is + * shifted into the register at k-1 bit of the register), which is typical + * textbook representation. The API transition table expects the most recent + * bit in the low order bit, or left shift. A bitswap operation is required + * to accommodate the difference. + */ +static unsigned gen_output(struct vstate *state, int val, + const struct osmo_conv_code *code) +{ + unsigned out, prev; + + prev = bitswap(state->prev[0], code->K - 1); + out = code->next_output[prev][val]; + out = bitswap(out, code->N); + + return out; +} + +/* Populate non-recursive trellis state + * For a given state defined by the k-1 length shift register, find the + * value of the input bit that drove the trellis to that state. Also + * generate the N outputs of the generator polynomial at that state. + */ +static int gen_state_info(uint8_t *val, unsigned reg, + int16_t *output, const struct osmo_conv_code *code) +{ + int i; + unsigned out; + struct vstate state; + + /* Previous '0' state */ + state.state = reg; + state.prev[0] = vstate_lshift(reg, code->K, 0); + state.prev[1] = vstate_lshift(reg, code->K, 1); + + *val = (reg >> (code->K - 2)) & 0x01; + + /* Transition output */ + out = gen_output(&state, *val, code); + + /* Unpack to NRZ */ + for (i = 0; i < code->N; i++) + output[i] = BIT2NRZ(out, i); + + return 0; +} + +/* Generate recursive state output from generator state table */ +static unsigned gen_recursive_output(struct vstate *state, + uint8_t *val, unsigned reg, + const struct osmo_conv_code *code, int pos) +{ + int val0, val1; + unsigned out, prev; + + /* Previous '0' state */ + prev = vstate_lshift(reg, code->K, 0); + prev = bitswap(prev, code->K - 1); + + /* Input value */ + val0 = (reg >> (code->K - 2)) & 0x01; + val1 = (code->next_term_output[prev] >> pos) & 0x01; + *val = val0 == val1 ? 
0 : 1; + + /* Wrapper for osmocom state access */ + prev = bitswap(state->prev[0], code->K - 1); + + /* Compute the transition output */ + out = code->next_output[prev][*val]; + out = bitswap(out, code->N); + + return out; +} + +/* Populate recursive trellis state + * The bit position of the systematic bit is not explicitly marked by the + * API, so it must be extracted from the generator table. Otherwise, + * populate the trellis similar to the non-recursive version. + * Non-systematic recursive codes are not supported. + */ +static int gen_recursive_state_info(uint8_t *val, + unsigned reg, int16_t *output, const struct osmo_conv_code *code) +{ + int i, j, pos = -1; + int ns = NUM_STATES(code->K); + unsigned out; + struct vstate state; + + /* Previous '0' and '1' states */ + state.state = reg; + state.prev[0] = vstate_lshift(reg, code->K, 0); + state.prev[1] = vstate_lshift(reg, code->K, 1); + + /* Find recursive bit location */ + for (i = 0; i < code->N; i++) { + for (j = 0; j < ns; j++) { + if ((code->next_output[j][0] >> i) & 0x01) + break; + } + + if (j == ns) { + pos = i; + break; + } + } + + /* Non-systematic recursive code not supported */ + if (pos < 0) + return -EPROTO; + + /* Transition output */ + out = gen_recursive_output(&state, val, reg, code, pos); + + /* Unpack to NRZ */ + for (i = 0; i < code->N; i++) + output[i] = BIT2NRZ(out, i); + + return 0; +} + +/* Release the trellis */ +static void free_trellis(struct vtrellis *trellis) +{ + if (!trellis) + return; + + vdec_free(trellis->outputs); + vdec_free(trellis->sums); + free(trellis->vals); + free(trellis); +} + +/* Allocate and initialize the trellis object + * Initialization consists of generating the outputs and output value of a + * given state. Due to trellis symmetry and anti-symmetry, only one of the + * transition paths is utilized by the butterfly operation in the forward + * recursion, so only one set of N outputs is required per state variable. + */ +static struct vtrellis *generate_trellis(const struct osmo_conv_code *code) +{ + int i, rc = -1; + struct vtrellis *trellis; + int16_t *outputs; + + int ns = NUM_STATES(code->K); + int recursive = conv_code_recursive(code); + int olen = (code->N == 2) ? 2 : 4; + + trellis = (struct vtrellis *) calloc(1, sizeof(struct vtrellis)); + if (!trellis) + goto fail; + + trellis->num_states = ns; + trellis->sums = vdec_malloc(ns); + trellis->outputs = vdec_malloc(ns * olen); + trellis->vals = (uint8_t *) malloc(ns * sizeof(uint8_t)); + + if (!trellis->sums || !trellis->outputs || !trellis->vals) + goto fail; + + /* Populate the trellis state objects */ + for (i = 0; i < ns; i++) { + outputs = &trellis->outputs[olen * i]; + if (recursive) { + rc = gen_recursive_state_info(&trellis->vals[i], + i, outputs, code); + } else { + rc = gen_state_info(&trellis->vals[i], + i, outputs, code); + } + } + + if (rc < 0) + goto fail; + + return trellis; + +fail: + free_trellis(trellis); + return NULL; +} + +/* Reset decoder + * Set accumulated path metrics to zero. For termination other than + * tail-biting, initialize the zero state as the encoder starting state. + * Initialize with the maximum accumulated sum at length equal to the + * constraint length. 
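+ *
+ * For example, with K=7 and N=2 the seed below is INT8_MAX * n * k =
+ * 127 * 2 * 7 = 1778, the largest metric a competing path can
+ * accumulate within one constraint length, so the known start state
+ * cannot be overtaken before the first real symbols are processed.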
+ */ +static void reset_decoder(struct vdecoder *dec, int term) +{ + int ns = dec->trellis->num_states; + + memset(dec->trellis->sums, 0, sizeof(int16_t) * ns); + + if (term != CONV_TERM_TAIL_BITING) + dec->trellis->sums[0] = INT8_MAX * dec->n * dec->k; +} + +static void _traceback(struct vdecoder *dec, + unsigned state, uint8_t *out, int len) +{ + int i; + unsigned path; + + for (i = len - 1; i >= 0; i--) { + path = dec->paths[i][state] + 1; + out[i] = dec->trellis->vals[state]; + state = vstate_lshift(state, dec->k, path); + } +} + +static void _traceback_rec(struct vdecoder *dec, + unsigned state, uint8_t *out, int len) +{ + int i; + unsigned path; + + for (i = len - 1; i >= 0; i--) { + path = dec->paths[i][state] + 1; + out[i] = path ^ dec->trellis->vals[state]; + state = vstate_lshift(state, dec->k, path); + } +} + +/* Traceback and generate decoded output + * Find the largest accumulated path metric at the final state except for + * the zero terminated case, where we assume the final state is always zero. + */ +static int traceback(struct vdecoder *dec, uint8_t *out, int term, int len) +{ + int i, sum, max = -1; + unsigned path, state = 0; + + if (term != CONV_TERM_FLUSH) { + for (i = 0; i < dec->trellis->num_states; i++) { + sum = dec->trellis->sums[i]; + if (sum > max) { + max = sum; + state = i; + } + } + + if (max < 0) + return -EPROTO; + } + + for (i = dec->len - 1; i >= len; i--) { + path = dec->paths[i][state] + 1; + state = vstate_lshift(state, dec->k, path); + } + + if (dec->recursive) + _traceback_rec(dec, state, out, len); + else + _traceback(dec, state, out, len); + + return 0; +} + +/* Release decoder object */ +static void free_vdec(struct vdecoder *dec) +{ + if (!dec) + return; + + free_trellis(dec->trellis); + + if (dec->paths != NULL) { + vdec_free(dec->paths[0]); + free(dec->paths); + } + + free(dec); +} + +/* Allocate decoder object + * Subtract the constraint length K on the normalization interval to + * accommodate the initialization path metric at state zero. 
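+ *
+ * For example, with N=2 and K=7: INT16_MAX / (2 * INT8_MAX) = 129, so
+ * the decoder renormalizes every 129 - 7 = 122 trellis steps, which
+ * keeps the 16-bit accumulated sums clear of saturation.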
+ */
+static struct vdecoder *alloc_vdec(const struct osmo_conv_code *code)
+{
+	int i, ns;
+	struct vdecoder *dec;
+
+	ns = NUM_STATES(code->K);
+
+	dec = (struct vdecoder *) calloc(1, sizeof(struct vdecoder));
+	if (!dec)
+		return NULL;
+
+	dec->n = code->N;
+	dec->k = code->K;
+	dec->recursive = conv_code_recursive(code);
+	dec->intrvl = INT16_MAX / (dec->n * INT8_MAX) - dec->k;
+
+	if (dec->k == 5) {
+		switch (dec->n) {
+		case 2:
+			dec->metric_func = osmo_conv_metrics_k5_n2;
+			break;
+		case 3:
+			dec->metric_func = osmo_conv_metrics_k5_n3;
+			break;
+		case 4:
+			dec->metric_func = osmo_conv_metrics_k5_n4;
+			break;
+		default:
+			goto fail;
+		}
+	} else if (dec->k == 7) {
+		switch (dec->n) {
+		case 2:
+			dec->metric_func = osmo_conv_metrics_k7_n2;
+			break;
+		case 3:
+			dec->metric_func = osmo_conv_metrics_k7_n3;
+			break;
+		case 4:
+			dec->metric_func = osmo_conv_metrics_k7_n4;
+			break;
+		default:
+			goto fail;
+		}
+	} else {
+		goto fail;
+	}
+
+	if (code->term == CONV_TERM_FLUSH)
+		dec->len = code->len + code->K - 1;
+	else
+		dec->len = code->len;
+
+	dec->trellis = generate_trellis(code);
+	if (!dec->trellis)
+		goto fail;
+
+	dec->paths = (int16_t **) malloc(sizeof(int16_t *) * dec->len);
+	if (!dec->paths)
+		goto fail;
+
+	dec->paths[0] = vdec_malloc(ns * dec->len);
+	if (!dec->paths[0])
+		goto fail;
+
+	for (i = 1; i < dec->len; i++)
+		dec->paths[i] = &dec->paths[0][i * ns];
+
+	return dec;
+
+fail:
+	free_vdec(dec);
+	return NULL;
+}
+
+/* Depuncture sequence with negative-value terminated puncturing matrix */
+static int depuncture(const int8_t *in, const int *punc, int8_t *out, int len)
+{
+	int i, n = 0, m = 0;
+
+	for (i = 0; i < len; i++) {
+		if (i == punc[n]) {
+			out[i] = 0;
+			n++;
+			continue;
+		}
+
+		out[i] = in[m++];
+	}
+
+	return 0;
+}
+
+/* Forward trellis recursion
+ * Generate branch metrics and path metrics with a combined function. Only
+ * accumulated path metric sums and path selections are stored. Normalize on
+ * the interval specified by the decoder.
+ */
+static void forward_traverse(struct vdecoder *dec, const int8_t *seq)
+{
+	struct vtrellis *trellis = dec->trellis;
+	int i;
+
+	for (i = 0; i < dec->len; i++) {
+		dec->metric_func(&seq[dec->n * i],
+			trellis->outputs,
+			trellis->sums,
+			dec->paths[i],
+			!(i % dec->intrvl));
+	}
+}
+
+/* Convolutional decode with a decoder object
+ * Initial puncturing run if necessary, followed by the forward recursion.
+ * For tail-biting, perform a second pass before running the backward
+ * traceback operation.
+ */
+static int conv_decode(struct vdecoder *dec, const int8_t *seq,
+	const int *punc, uint8_t *out, int len, int term)
+{
+	int8_t depunc[dec->len * dec->n];
+
+	reset_decoder(dec, term);
+
+	if (punc) {
+		depuncture(seq, punc, depunc, dec->len * dec->n);
+		seq = depunc;
+	}
+
+	/* Propagate through the trellis with interval normalization */
+	forward_traverse(dec, seq);
+
+	if (term == CONV_TERM_TAIL_BITING)
+		forward_traverse(dec, seq);
+
+	return traceback(dec, out, term, len);
+}
+
+static void osmo_conv_init(void)
+{
+	init_complete = 1;
+
+#ifdef HAVE___BUILTIN_CPU_SUPPORTS
+	/* Detect CPU capabilities */
+	#ifdef HAVE_AVX2
+	avx2_supported = __builtin_cpu_supports("avx2");
+	#endif
+
+	#ifdef HAVE_SSE3
+	sse3_supported = __builtin_cpu_supports("sse3");
+	#endif
+
+	#ifdef HAVE_SSE4_1
+	sse41_supported = __builtin_cpu_supports("sse4.1");
+	#endif
+#endif
+
+/**
+ * Usage of curly braces is mandatory here,
+ * because we use a multi-line define.
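+ *
+ * Without braces the expansion breaks: INIT_POINTERS() expands to a
+ * '{ ... }' block, so in
+ *
+ *	if (sse3_supported)
+ *		INIT_POINTERS(sse);
+ *	else
+ *
+ * the ';' left after the expanded block would orphan the 'else'.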
+ */
+#if defined(HAVE_SSE3) && defined(HAVE_AVX2)
+	if (sse3_supported && avx2_supported) {
+		INIT_POINTERS(sse_avx);
+	} else if (sse3_supported) {
+		INIT_POINTERS(sse);
+	} else {
+		INIT_POINTERS(gen);
+	}
+#elif defined(HAVE_SSE3)
+	if (sse3_supported) {
+		INIT_POINTERS(sse);
+	} else {
+		INIT_POINTERS(gen);
+	}
+#else
+	INIT_POINTERS(gen);
+#endif
+}
+
+/* All-in-one Viterbi decoding */
+int osmo_conv_decode_acc(const struct osmo_conv_code *code,
+	const sbit_t *input, ubit_t *output)
+{
+	int rc;
+	struct vdecoder *vdec;
+
+	if (!init_complete)
+		osmo_conv_init();
+
+	if ((code->N < 2) || (code->N > 4) || (code->len < 1) ||
+		((code->K != 5) && (code->K != 7)))
+		return -EINVAL;
+
+	vdec = alloc_vdec(code);
+	if (!vdec)
+		return -EFAULT;
+
+	rc = conv_decode(vdec, input, code->puncture,
+		output, code->len, code->term);
+
+	free_vdec(vdec);
+
+	return rc;
+}
diff --git a/src/conv_acc_generic.c b/src/conv_acc_generic.c
new file mode 100644
index 00000000..60426685
--- /dev/null
+++ b/src/conv_acc_generic.c
@@ -0,0 +1,207 @@
+/*
+ * Viterbi decoder
+ *
+ * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
+ *
+ * All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Add-Compare-Select (ACS-Butterfly)
+ * Compute 4 accumulated path metrics and 4 path selections. Note that path
+ * selections are stored as -1 and 0 rather than 0 and 1. This is to match
+ * the output format of the SSE packed compare instruction 'pmaxuw'.
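+ *
+ * This convention also keeps the traceback cheap: the traceback routines
+ * in conv_acc.c recover a conventional 0/1 selection as 'path + 1', so a
+ * stored -1 selects the previous '0' state and a stored 0 the '1' state.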
+ */ + +static void acs_butterfly(int state, int num_states, + int16_t metric, int16_t *sum, + int16_t *new_sum, int16_t *path) +{ + int state0, state1; + int sum0, sum1, sum2, sum3; + + state0 = *(sum + (2 * state + 0)); + state1 = *(sum + (2 * state + 1)); + + sum0 = state0 + metric; + sum1 = state1 - metric; + sum2 = state0 - metric; + sum3 = state1 + metric; + + if (sum0 >= sum1) { + *new_sum = sum0; + *path = -1; + } else { + *new_sum = sum1; + *path = 0; + } + + if (sum2 >= sum3) { + *(new_sum + num_states / 2) = sum2; + *(path + num_states / 2) = -1; + } else { + *(new_sum + num_states / 2) = sum3; + *(path + num_states / 2) = 0; + } +} + +/* Branch metrics unit N=2 */ +static void gen_branch_metrics_n2(int num_states, const int8_t *seq, + const int16_t *out, int16_t *metrics) +{ + int i; + + for (i = 0; i < num_states / 2; i++) { + metrics[i] = seq[0] * out[2 * i + 0] + + seq[1] * out[2 * i + 1]; + } +} + +/* Branch metrics unit N=3 */ +static void gen_branch_metrics_n3(int num_states, const int8_t *seq, + const int16_t *out, int16_t *metrics) +{ + int i; + + for (i = 0; i < num_states / 2; i++) { + metrics[i] = seq[0] * out[4 * i + 0] + + seq[1] * out[4 * i + 1] + + seq[2] * out[4 * i + 2]; + } +} + +/* Branch metrics unit N=4 */ +static void gen_branch_metrics_n4(int num_states, const int8_t *seq, + const int16_t *out, int16_t *metrics) +{ + int i; + + for (i = 0; i < num_states / 2; i++) { + metrics[i] = seq[0] * out[4 * i + 0] + + seq[1] * out[4 * i + 1] + + seq[2] * out[4 * i + 2] + + seq[3] * out[4 * i + 3]; + } +} + +/* Path metric unit */ +static void gen_path_metrics(int num_states, int16_t *sums, + int16_t *metrics, int16_t *paths, int norm) +{ + int i; + int16_t min; + int16_t new_sums[num_states]; + + for (i = 0; i < num_states / 2; i++) + acs_butterfly(i, num_states, metrics[i], + sums, &new_sums[i], &paths[i]); + + if (norm) { + min = new_sums[0]; + + for (i = 1; i < num_states; i++) + if (new_sums[i] < min) + min = new_sums[i]; + + for (i = 0; i < num_states; i++) + new_sums[i] -= min; + } + + memcpy(sums, new_sums, num_states * sizeof(int16_t)); +} + +/* Not-aligned Memory Allocator */ +__attribute__ ((visibility("hidden"))) +int16_t *osmo_conv_gen_vdec_malloc(size_t n) +{ + return (int16_t *) malloc(sizeof(int16_t) * n); +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_gen_vdec_free(int16_t *ptr) +{ + free(ptr); +} + +/* 16-state branch-path metrics units (K=5) */ +__attribute__ ((visibility("hidden"))) +void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + int16_t metrics[8]; + + gen_branch_metrics_n2(16, seq, out, metrics); + gen_path_metrics(16, sums, metrics, paths, norm); +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_gen_metrics_k5_n3(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + int16_t metrics[8]; + + gen_branch_metrics_n3(16, seq, out, metrics); + gen_path_metrics(16, sums, metrics, paths, norm); + +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_gen_metrics_k5_n4(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + int16_t metrics[8]; + + gen_branch_metrics_n4(16, seq, out, metrics); + gen_path_metrics(16, sums, metrics, paths, norm); + +} + +/* 64-state branch-path metrics units (K=7) */ +__attribute__ ((visibility("hidden"))) +void osmo_conv_gen_metrics_k7_n2(const int8_t *seq, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + int16_t metrics[32]; + + 
	gen_branch_metrics_n2(64, seq, out, metrics);
+	gen_path_metrics(64, sums, metrics, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_gen_metrics_k7_n3(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	int16_t metrics[32];
+
+	gen_branch_metrics_n3(64, seq, out, metrics);
+	gen_path_metrics(64, sums, metrics, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	int16_t metrics[32];
+
+	gen_branch_metrics_n4(64, seq, out, metrics);
+	gen_path_metrics(64, sums, metrics, paths, norm);
+}
diff --git a/src/conv_acc_sse.c b/src/conv_acc_sse.c
new file mode 100644
index 00000000..9cbcc2f1
--- /dev/null
+++ b/src/conv_acc_sse.c
@@ -0,0 +1,129 @@
+/*
+ * Intel SSE Viterbi decoder
+ *
+ * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
+ *
+ * All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+#include "config.h"
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+
+#if defined(HAVE_SSE4_1)
+#include <smmintrin.h>
+#endif
+
+#define SSE_ALIGN 16
+
+/* Broadcast 16-bit integer
+ * Repeat the low 16-bit integer to all elements of the 128-bit SSE
+ * register. Only AVX2 has a dedicated broadcast instruction; use repeat
+ * unpacks for SSE only architectures. This is a destructive operation and
+ * the source register is overwritten.
+ *
+ * Input:
+ * M0 - Low 16-bit element is read
+ *
+ * Output:
+ * M0 - Contains broadcasted values
+ */
+#define SSE_BROADCAST(M0) \
+{ \
+	M0 = _mm_unpacklo_epi16(M0, M0); \
+	M0 = _mm_unpacklo_epi32(M0, M0); \
+	M0 = _mm_unpacklo_epi64(M0, M0); \
+}
+
+/**
+ * Include common SSE implementation
+ */
+#include <conv_acc_sse_impl.h>
+
+/* Aligned Memory Allocator
+ * SSE requires 16-byte memory alignment. We store relevant trellis values
+ * (accumulated sums, outputs, and path decisions) as 16 bit signed integers
+ * so the allocated memory is cast as such.
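+ *
+ * As an illustration, every buffer returned here satisfies
+ * ((uintptr_t) ptr & 0xf) == 0, which the _mm_load_si128() and
+ * _mm_store_si128() accesses in the metric units below require; an
+ * unaligned buffer would fault on the aligned SSE loads.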
+ */
+__attribute__ ((visibility("hidden")))
+int16_t *osmo_conv_sse_vdec_malloc(size_t n)
+{
+	return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_vdec_free(int16_t *ptr)
+{
+	_mm_free(ptr);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_metrics_k5_n2(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k5_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_metrics_k5_n3(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_metrics_k5_n4(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_metrics_k7_n2(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k7_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_metrics_k7_n3(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_metrics_k7_n4(const int8_t *val, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
+}
diff --git a/src/conv_acc_sse_avx.c b/src/conv_acc_sse_avx.c
new file mode 100644
index 00000000..5d9a82b7
--- /dev/null
+++ b/src/conv_acc_sse_avx.c
@@ -0,0 +1,129 @@
+/*
+ * Intel SSE + AVX Viterbi decoder
+ *
+ * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
+ *
+ * All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+#include "config.h"
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+#include <immintrin.h>
+
+#if defined(HAVE_SSE4_1)
+#include <smmintrin.h>
+#endif
+
+#define SSE_ALIGN 16
+
+/* Broadcast 16-bit integer
+ * Repeat the low 16-bit integer to all elements of the 128-bit SSE
+ * register. Only AVX2 has a dedicated broadcast instruction; use repeat
+ * unpacks for SSE only architectures. This is a destructive operation and
+ * the source register is overwritten.
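+ *
+ * With AVX2 available, the whole operation reduces to the single
+ * vpbroadcastw instruction (_mm_broadcastw_epi16() below), replacing
+ * the three-step unpack sequence used by the SSE-only variant.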
+ *
+ * Input:
+ * M0 - Low 16-bit element is read
+ *
+ * Output:
+ * M0 - Contains broadcasted values
+ */
+#define SSE_BROADCAST(M0) \
+{ \
+	M0 = _mm_broadcastw_epi16(M0); \
+}
+
+/**
+ * Include common SSE implementation
+ */
+#include <conv_acc_sse_impl.h>
+
+/* Aligned Memory Allocator
+ * SSE requires 16-byte memory alignment. We store relevant trellis values
+ * (accumulated sums, outputs, and path decisions) as 16 bit signed integers
+ * so the allocated memory is cast as such.
+ */
+__attribute__ ((visibility("hidden")))
+int16_t *osmo_conv_sse_avx_vdec_malloc(size_t n)
+{
+	return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_vdec_free(int16_t *ptr)
+{
+	_mm_free(ptr);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k5_n2(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k5_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k5_n3(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k5_n4(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k7_n2(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k7_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k7_n3(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k7_n4(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
}
diff --git a/src/conv_acc_sse_impl.h b/src/conv_acc_sse_impl.h
new file mode 100644
index 00000000..7d48c942
--- /dev/null
+++ b/src/conv_acc_sse_impl.h
@@ -0,0 +1,495 @@
+/*
+ * Intel SSE Viterbi decoder
+ *
+ * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
+ *
+ * All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */ + +extern int sse41_supported; + +/* Octo-Viterbi butterfly + * Compute 8-wide butterfly generating 16 path decisions and 16 accumulated + * sums. Inputs all packed 16-bit integers in three 128-bit XMM registers. + * Two intermediate registers are used and results are set in the upper 4 + * registers. + * + * Input: + * M0 - Path metrics 0 (packed 16-bit integers) + * M1 - Path metrics 1 (packed 16-bit integers) + * M2 - Branch metrics (packed 16-bit integers) + * + * Output: + * M2 - Selected and accumulated path metrics 0 + * M4 - Selected and accumulated path metrics 1 + * M3 - Path selections 0 + * M1 - Path selections 1 + */ +#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \ +{ \ + M3 = _mm_adds_epi16(M0, M2); \ + M4 = _mm_subs_epi16(M1, M2); \ + M0 = _mm_subs_epi16(M0, M2); \ + M1 = _mm_adds_epi16(M1, M2); \ + M2 = _mm_max_epi16(M3, M4); \ + M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \ + M4 = _mm_max_epi16(M0, M1); \ + M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \ +} + +/* Two lane deinterleaving K = 5: + * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit + * registers. The operation summarized below. Four registers are used with + * the lower 2 as input and upper 2 as output. + * + * In - 10101010 10101010 10101010 10101010 + * Out - 00000000 11111111 00000000 11111111 + * + * Input: + * M0:1 - Packed 16-bit integers + * + * Output: + * M2:3 - Deinterleaved packed 16-bit integers + */ +#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + +#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \ +{ \ + M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \ + M0 = _mm_shuffle_epi8(M0, M2); \ + M1 = _mm_shuffle_epi8(M1, M2); \ + M2 = _mm_unpacklo_epi64(M0, M1); \ + M3 = _mm_unpackhi_epi64(M0, M1); \ +} + +/* Two lane deinterleaving K = 7: + * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit + * registers. The operation summarized below. 16 registers are used with the + * lower 8 as input and upper 8 as output. + * + * In - 10101010 10101010 10101010 10101010 ... + * Out - 00000000 11111111 00000000 11111111 ... + * + * Input: + * M0:7 - Packed 16-bit integers + * + * Output: + * M8:15 - Deinterleaved packed 16-bit integers + */ +#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \ + M8, M9, M10, M11, M12, M13, M14, M15) \ +{ \ + M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \ + M0 = _mm_shuffle_epi8(M0, M8); \ + M1 = _mm_shuffle_epi8(M1, M8); \ + M2 = _mm_shuffle_epi8(M2, M8); \ + M3 = _mm_shuffle_epi8(M3, M8); \ + M4 = _mm_shuffle_epi8(M4, M8); \ + M5 = _mm_shuffle_epi8(M5, M8); \ + M6 = _mm_shuffle_epi8(M6, M8); \ + M7 = _mm_shuffle_epi8(M7, M8); \ + M8 = _mm_unpacklo_epi64(M0, M1); \ + M9 = _mm_unpackhi_epi64(M0, M1); \ + M10 = _mm_unpacklo_epi64(M2, M3); \ + M11 = _mm_unpackhi_epi64(M2, M3); \ + M12 = _mm_unpacklo_epi64(M4, M5); \ + M13 = _mm_unpackhi_epi64(M4, M5); \ + M14 = _mm_unpacklo_epi64(M6, M7); \ + M15 = _mm_unpackhi_epi64(M6, M7); \ +} + +/* Generate branch metrics N = 2: + * Compute 16 branch metrics from trellis outputs and input values. 
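+ * Each metric is seq[0] * o0 + seq[1] * o1: _mm_sign_epi16() negates
+ * the input copies wherever the corresponding ±1 trellis output is
+ * negative, and _mm_hadds_epi16() then sums adjacent pairs with
+ * saturation.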
+ * + * Input: + * M0:3 - 16 x 2 packed 16-bit trellis outputs + * M4 - Expanded and packed 16-bit input value + * + * Output: + * M6:7 - 16 computed 16-bit branch metrics + */ +#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \ +{ \ + M0 = _mm_sign_epi16(M4, M0); \ + M1 = _mm_sign_epi16(M4, M1); \ + M2 = _mm_sign_epi16(M4, M2); \ + M3 = _mm_sign_epi16(M4, M3); \ + M6 = _mm_hadds_epi16(M0, M1); \ + M7 = _mm_hadds_epi16(M2, M3); \ +} + +/* Generate branch metrics N = 4: + * Compute 8 branch metrics from trellis outputs and input values. This + * macro is reused for N less than 4 where the extra soft input bits are + * padded. + * + * Input: + * M0:3 - 8 x 4 packed 16-bit trellis outputs + * M4 - Expanded and packed 16-bit input value + * + * Output: + * M5 - 8 computed 16-bit branch metrics + */ +#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \ +{ \ + M0 = _mm_sign_epi16(M4, M0); \ + M1 = _mm_sign_epi16(M4, M1); \ + M2 = _mm_sign_epi16(M4, M2); \ + M3 = _mm_sign_epi16(M4, M3); \ + M0 = _mm_hadds_epi16(M0, M1); \ + M1 = _mm_hadds_epi16(M2, M3); \ + M5 = _mm_hadds_epi16(M0, M1); \ +} + +/* Horizontal minimum + * Compute horizontal minimum of packed unsigned 16-bit integers and place + * result in the low 16-bit element of the source register. Only SSE 4.1 + * has a dedicated minpos instruction. One intermediate register is used + * if SSE 4.1 is not available. This is a destructive operation and the + * source register is overwritten. + * + * Input: + * M0 - Packed unsigned 16-bit integers + * + * Output: + * M0 - Minimum value placed in low 16-bit element + */ +#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41) +#define SSE_MINPOS(M0, M1) \ +{ \ + if (sse41_supported) { \ + M0 = _mm_minpos_epu16(M0); \ + } else { \ + M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \ + M0 = _mm_min_epi16(M0, M1); \ + M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \ + M0 = _mm_min_epi16(M0, M1); \ + M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \ + M0 = _mm_min_epi16(M0, M1); \ + } \ +} +#else +#define SSE_MINPOS(M0, M1) \ +{ \ + M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \ + M0 = _mm_min_epi16(M0, M1); \ + M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \ + M0 = _mm_min_epi16(M0, M1); \ + M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \ + M0 = _mm_min_epi16(M0, M1); \ +} +#endif + +/* Normalize state metrics K = 5: + * Compute 16-wide normalization by subtracting the smallest value from + * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers. + * Two intermediate registers are used and normalized results are placed + * in the originating locations. + * + * Input: + * M0:1 - Path metrics 0:1 (packed 16-bit integers) + * + * Output: + * M0:1 - Normalized path metrics 0:1 + */ +#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \ +{ \ + M2 = _mm_min_epi16(M0, M1); \ + SSE_MINPOS(M2, M3) \ + SSE_BROADCAST(M2) \ + M0 = _mm_subs_epi16(M0, M2); \ + M1 = _mm_subs_epi16(M1, M2); \ +} + +/* Normalize state metrics K = 7: + * Compute 64-wide normalization by subtracting the smallest value from + * all values. Inputs are 8 registers of accumulated sums and 4 temporary + * registers. Normalized results are returned in the originating locations. 
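+ * The minimum is found once with a tree of _mm_min_epi16() reductions
+ * across the eight registers, followed by SSE_MINPOS() and
+ * SSE_BROADCAST(), and is then subtracted from all 64 sums.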
+ *
+ * Input:
+ * M0:7 - Path metrics 0:7 (packed 16-bit integers)
+ *
+ * Output:
+ * M0:7 - Normalized path metrics 0:7
+ */
+#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
+{ \
+	M8 = _mm_min_epi16(M0, M1); \
+	M9 = _mm_min_epi16(M2, M3); \
+	M10 = _mm_min_epi16(M4, M5); \
+	M11 = _mm_min_epi16(M6, M7); \
+	M8 = _mm_min_epi16(M8, M9); \
+	M10 = _mm_min_epi16(M10, M11); \
+	M8 = _mm_min_epi16(M8, M10); \
+	SSE_MINPOS(M8, M9) \
+	SSE_BROADCAST(M8) \
+	M0 = _mm_subs_epi16(M0, M8); \
+	M1 = _mm_subs_epi16(M1, M8); \
+	M2 = _mm_subs_epi16(M2, M8); \
+	M3 = _mm_subs_epi16(M3, M8); \
+	M4 = _mm_subs_epi16(M4, M8); \
+	M5 = _mm_subs_epi16(M5, M8); \
+	M6 = _mm_subs_epi16(M6, M8); \
+	M7 = _mm_subs_epi16(M7, M8); \
+}
+
+/* Combined BMU/PMU (K=5, N=2)
+ * Compute branch metrics followed by path metrics for the half rate
+ * 16-state trellis. 8 butterflies are computed. Accumulated path sums are
+ * not preserved; they are read from and written back to the same memory
+ * location. Normalize the sums if required.
+ */
+__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	__m128i m0, m1, m2, m3, m4, m5, m6;
+
+	/* (BMU) Load input sequence */
+	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
+
+	/* (BMU) Load trellis outputs */
+	m0 = _mm_load_si128((__m128i *) &out[0]);
+	m1 = _mm_load_si128((__m128i *) &out[8]);
+
+	/* (BMU) Compute branch metrics */
+	m0 = _mm_sign_epi16(m2, m0);
+	m1 = _mm_sign_epi16(m2, m1);
+	m2 = _mm_hadds_epi16(m0, m1);
+
+	/* (PMU) Load accumulated path metrics */
+	m0 = _mm_load_si128((__m128i *) &sums[0]);
+	m1 = _mm_load_si128((__m128i *) &sums[8]);
+
+	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)
+
+	/* (PMU) Butterflies: 0-7 */
+	SSE_BUTTERFLY(m3, m4, m2, m5, m6)
+
+	if (norm)
+		SSE_NORMALIZE_K5(m2, m6, m0, m1)
+
+	_mm_store_si128((__m128i *) &sums[0], m2);
+	_mm_store_si128((__m128i *) &sums[8], m6);
+	_mm_store_si128((__m128i *) &paths[0], m5);
+	_mm_store_si128((__m128i *) &paths[8], m4);
+}
+
+/* Combined BMU/PMU (K=5, N=3 and N=4)
+ * Compute branch metrics followed by path metrics for the 16-state trellis
+ * at rates up to 1/4. 8 butterflies are computed. The input sequence is
+ * read four 16-bit values at a time, and extra values should be set to
+ * zero for rates other than 1/4. Normally only rates 1/3 and 1/4 are used
+ * as there is a dedicated implementation of rate 1/2.
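+ *
+ * For example, the N=3 wrappers in conv_acc_sse.c and conv_acc_sse_avx.c
+ * pass { val[0], val[1], val[2], 0 }, so the fourth product contributes
+ * nothing and this kernel yields the correct rate-1/3 branch metrics.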
+ */ +__always_inline static void _sse_metrics_k5_n4(const int16_t *val, + const int16_t *out, int16_t *sums, int16_t *paths, int norm) +{ + __m128i m0, m1, m2, m3, m4, m5, m6; + + /* (BMU) Load input sequence */ + m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val)); + + /* (BMU) Load trellis outputs */ + m0 = _mm_load_si128((__m128i *) &out[0]); + m1 = _mm_load_si128((__m128i *) &out[8]); + m2 = _mm_load_si128((__m128i *) &out[16]); + m3 = _mm_load_si128((__m128i *) &out[24]); + + SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2) + + /* (PMU) Load accumulated path metrics */ + m0 = _mm_load_si128((__m128i *) &sums[0]); + m1 = _mm_load_si128((__m128i *) &sums[8]); + + SSE_DEINTERLEAVE_K5(m0, m1, m3, m4) + + /* (PMU) Butterflies: 0-7 */ + SSE_BUTTERFLY(m3, m4, m2, m5, m6) + + if (norm) + SSE_NORMALIZE_K5(m2, m6, m0, m1) + + _mm_store_si128((__m128i *) &sums[0], m2); + _mm_store_si128((__m128i *) &sums[8], m6); + _mm_store_si128((__m128i *) &paths[0], m5); + _mm_store_si128((__m128i *) &paths[8], m4); +} + +/* Combined BMU/PMU (K=7, N=2) + * Compute branch metrics followed by path metrics for half rate 64-state + * trellis. 32 butterfly operations are computed. Deinterleaving path + * metrics requires usage of the full SSE register file, so separate sums + * before computing branch metrics to avoid register spilling. + */ +__always_inline static void _sse_metrics_k7_n2(const int16_t *val, + const int16_t *out, int16_t *sums, int16_t *paths, int norm) +{ + __m128i m0, m1, m2, m3, m4, m5, m6, m7, m8, + m9, m10, m11, m12, m13, m14, m15; + + /* (PMU) Load accumulated path metrics */ + m0 = _mm_load_si128((__m128i *) &sums[0]); + m1 = _mm_load_si128((__m128i *) &sums[8]); + m2 = _mm_load_si128((__m128i *) &sums[16]); + m3 = _mm_load_si128((__m128i *) &sums[24]); + m4 = _mm_load_si128((__m128i *) &sums[32]); + m5 = _mm_load_si128((__m128i *) &sums[40]); + m6 = _mm_load_si128((__m128i *) &sums[48]); + m7 = _mm_load_si128((__m128i *) &sums[56]); + + /* (PMU) Deinterleave to even-odd registers */ + SSE_DEINTERLEAVE_K7(m0, m1, m2, m3 ,m4 ,m5, m6, m7, + m8, m9, m10, m11, m12, m13, m14, m15) + + /* (BMU) Load input symbols */ + m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val)); + + /* (BMU) Load trellis outputs */ + m0 = _mm_load_si128((__m128i *) &out[0]); + m1 = _mm_load_si128((__m128i *) &out[8]); + m2 = _mm_load_si128((__m128i *) &out[16]); + m3 = _mm_load_si128((__m128i *) &out[24]); + + SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5) + + m0 = _mm_load_si128((__m128i *) &out[32]); + m1 = _mm_load_si128((__m128i *) &out[40]); + m2 = _mm_load_si128((__m128i *) &out[48]); + m3 = _mm_load_si128((__m128i *) &out[56]); + + SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7) + + /* (PMU) Butterflies: 0-15 */ + SSE_BUTTERFLY(m8, m9, m4, m0, m1) + SSE_BUTTERFLY(m10, m11, m5, m2, m3) + + _mm_store_si128((__m128i *) &paths[0], m0); + _mm_store_si128((__m128i *) &paths[8], m2); + _mm_store_si128((__m128i *) &paths[32], m9); + _mm_store_si128((__m128i *) &paths[40], m11); + + /* (PMU) Butterflies: 17-31 */ + SSE_BUTTERFLY(m12, m13, m6, m0, m2) + SSE_BUTTERFLY(m14, m15, m7, m9, m11) + + _mm_store_si128((__m128i *) &paths[16], m0); + _mm_store_si128((__m128i *) &paths[24], m9); + _mm_store_si128((__m128i *) &paths[48], m13); + _mm_store_si128((__m128i *) &paths[56], m15); + + if (norm) + SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, + m7, m11, m0, m8, m9, m10) + + _mm_store_si128((__m128i *) &sums[0], m4); + _mm_store_si128((__m128i *) &sums[8], m5); + _mm_store_si128((__m128i *) &sums[16], m6); + 
_mm_store_si128((__m128i *) &sums[24], m7); + _mm_store_si128((__m128i *) &sums[32], m1); + _mm_store_si128((__m128i *) &sums[40], m3); + _mm_store_si128((__m128i *) &sums[48], m2); + _mm_store_si128((__m128i *) &sums[56], m11); +} + +/* Combined BMU/PMU (K=7, N=3 and N=4) + * Compute branch metrics followed by path metrics for half rate 64-state + * trellis. 32 butterfly operations are computed. Deinterleave path + * metrics before computing branch metrics as in the half rate case. + */ +__always_inline static void _sse_metrics_k7_n4(const int16_t *val, + const int16_t *out, int16_t *sums, int16_t *paths, int norm) +{ + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i m8, m9, m10, m11, m12, m13, m14, m15; + + /* (PMU) Load accumulated path metrics */ + m0 = _mm_load_si128((__m128i *) &sums[0]); + m1 = _mm_load_si128((__m128i *) &sums[8]); + m2 = _mm_load_si128((__m128i *) &sums[16]); + m3 = _mm_load_si128((__m128i *) &sums[24]); + m4 = _mm_load_si128((__m128i *) &sums[32]); + m5 = _mm_load_si128((__m128i *) &sums[40]); + m6 = _mm_load_si128((__m128i *) &sums[48]); + m7 = _mm_load_si128((__m128i *) &sums[56]); + + /* (PMU) Deinterleave into even and odd packed registers */ + SSE_DEINTERLEAVE_K7(m0, m1, m2, m3 ,m4 ,m5, m6, m7, + m8, m9, m10, m11, m12, m13, m14, m15) + + /* (BMU) Load and expand 8-bit input out to 16-bits */ + m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val)); + + /* (BMU) Load and compute branch metrics */ + m0 = _mm_load_si128((__m128i *) &out[0]); + m1 = _mm_load_si128((__m128i *) &out[8]); + m2 = _mm_load_si128((__m128i *) &out[16]); + m3 = _mm_load_si128((__m128i *) &out[24]); + + SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4) + + m0 = _mm_load_si128((__m128i *) &out[32]); + m1 = _mm_load_si128((__m128i *) &out[40]); + m2 = _mm_load_si128((__m128i *) &out[48]); + m3 = _mm_load_si128((__m128i *) &out[56]); + + SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5) + + m0 = _mm_load_si128((__m128i *) &out[64]); + m1 = _mm_load_si128((__m128i *) &out[72]); + m2 = _mm_load_si128((__m128i *) &out[80]); + m3 = _mm_load_si128((__m128i *) &out[88]); + + SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6) + + m0 = _mm_load_si128((__m128i *) &out[96]); + m1 = _mm_load_si128((__m128i *) &out[104]); + m2 = _mm_load_si128((__m128i *) &out[112]); + m3 = _mm_load_si128((__m128i *) &out[120]); + + SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7) + + /* (PMU) Butterflies: 0-15 */ + SSE_BUTTERFLY(m8, m9, m4, m0, m1) + SSE_BUTTERFLY(m10, m11, m5, m2, m3) + + _mm_store_si128((__m128i *) &paths[0], m0); + _mm_store_si128((__m128i *) &paths[8], m2); + _mm_store_si128((__m128i *) &paths[32], m9); + _mm_store_si128((__m128i *) &paths[40], m11); + + /* (PMU) Butterflies: 17-31 */ + SSE_BUTTERFLY(m12, m13, m6, m0, m2) + SSE_BUTTERFLY(m14, m15, m7, m9, m11) + + _mm_store_si128((__m128i *) &paths[16], m0); + _mm_store_si128((__m128i *) &paths[24], m9); + _mm_store_si128((__m128i *) &paths[48], m13); + _mm_store_si128((__m128i *) &paths[56], m15); + + if (norm) + SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, + m7, m11, m0, m8, m9, m10) + + _mm_store_si128((__m128i *) &sums[0], m4); + _mm_store_si128((__m128i *) &sums[8], m5); + _mm_store_si128((__m128i *) &sums[16], m6); + _mm_store_si128((__m128i *) &sums[24], m7); + _mm_store_si128((__m128i *) &sums[32], m1); + _mm_store_si128((__m128i *) &sums[40], m3); + _mm_store_si128((__m128i *) &sums[48], m2); + _mm_store_si128((__m128i *) &sums[56], m11); +} diff --git a/src/viterbi.c b/src/viterbi.c deleted file mode 100644 index 308cfe07..00000000 --- a/src/viterbi.c +++ 
/dev/null @@ -1,728 +0,0 @@ -/* - * Viterbi decoder - * - * Copyright (C) 2013, 2014 Thomas Tsou - * - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include -#include -#include - -#include "config.h" - -#include - -#define BIT2NRZ(REG,N) (((REG >> N) & 0x01) * 2 - 1) * -1 -#define NUM_STATES(K) (K == 7 ? 64 : 16) - -#define INIT_POINTERS(simd) \ -{ \ - osmo_conv_metrics_k5_n2 = osmo_conv_##simd##_metrics_k5_n2; \ - osmo_conv_metrics_k5_n3 = osmo_conv_##simd##_metrics_k5_n3; \ - osmo_conv_metrics_k5_n4 = osmo_conv_##simd##_metrics_k5_n4; \ - osmo_conv_metrics_k7_n2 = osmo_conv_##simd##_metrics_k7_n2; \ - osmo_conv_metrics_k7_n3 = osmo_conv_##simd##_metrics_k7_n3; \ - osmo_conv_metrics_k7_n4 = osmo_conv_##simd##_metrics_k7_n4; \ - vdec_malloc = &osmo_conv_##simd##_vdec_malloc; \ - vdec_free = &osmo_conv_##simd##_vdec_free; \ -} - -static int init_complete = 0; - -__attribute__ ((visibility("hidden"))) int avx2_supported = 0; -__attribute__ ((visibility("hidden"))) int sse3_supported = 0; -__attribute__ ((visibility("hidden"))) int sse41_supported = 0; - -/** - * These pointers are being initialized at runtime by the - * osmo_conv_init() depending on supported SIMD extensions. 
- */ -static int16_t *(*vdec_malloc)(size_t n); -static void (*vdec_free)(int16_t *ptr); - -void (*osmo_conv_metrics_k5_n2)(const int8_t *seq, - const int16_t *out, int16_t *sums, int16_t *paths, int norm); -void (*osmo_conv_metrics_k5_n3)(const int8_t *seq, - const int16_t *out, int16_t *sums, int16_t *paths, int norm); -void (*osmo_conv_metrics_k5_n4)(const int8_t *seq, - const int16_t *out, int16_t *sums, int16_t *paths, int norm); -void (*osmo_conv_metrics_k7_n2)(const int8_t *seq, - const int16_t *out, int16_t *sums, int16_t *paths, int norm); -void (*osmo_conv_metrics_k7_n3)(const int8_t *seq, - const int16_t *out, int16_t *sums, int16_t *paths, int norm); -void (*osmo_conv_metrics_k7_n4)(const int8_t *seq, - const int16_t *out, int16_t *sums, int16_t *paths, int norm); - -/* Forward malloc wrappers */ -int16_t *osmo_conv_gen_vdec_malloc(size_t n); -void osmo_conv_gen_vdec_free(int16_t *ptr); - -#if defined(HAVE_SSE3) -int16_t *osmo_conv_sse_vdec_malloc(size_t n); -void osmo_conv_sse_vdec_free(int16_t *ptr); -#endif - -#if defined(HAVE_SSE3) && defined(HAVE_AVX2) -int16_t *osmo_conv_sse_avx_vdec_malloc(size_t n); -void osmo_conv_sse_avx_vdec_free(int16_t *ptr); -#endif - -/* Forward Metric Units */ -void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_gen_metrics_k5_n3(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_gen_metrics_k5_n4(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_gen_metrics_k7_n2(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_gen_metrics_k7_n3(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); - -#if defined(HAVE_SSE3) -void osmo_conv_sse_metrics_k5_n2(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_metrics_k5_n3(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_metrics_k5_n4(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_metrics_k7_n2(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_metrics_k7_n3(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_metrics_k7_n4(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -#endif - -#if defined(HAVE_SSE3) && defined(HAVE_AVX2) -void osmo_conv_sse_avx_metrics_k5_n2(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_avx_metrics_k5_n3(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_avx_metrics_k5_n4(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_avx_metrics_k7_n2(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_avx_metrics_k7_n3(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -void osmo_conv_sse_avx_metrics_k7_n4(const int8_t *seq, const int16_t *out, - int16_t *sums, int16_t *paths, int norm); -#endif - -/* Trellis State - * state - Internal lshift register value - * prev - Register values of previous 0 and 1 
-/* Trellis State
- * state - Internal lshift register value
- * prev  - Register values of previous 0 and 1 states
- */
-struct vstate {
-	unsigned state;
-	unsigned prev[2];
-};
-
-/* Trellis Object
- * num_states - Number of states in the trellis
- * sums       - Accumulated path metrics
- * outputs    - Trellis output values
- * vals       - Input value that led to each state
- */
-struct vtrellis {
-	int num_states;
-	int16_t *sums;
-	int16_t *outputs;
-	uint8_t *vals;
-};
-
-/* Viterbi Decoder
- * n         - Code order
- * k         - Constraint length
- * len       - Horizontal length of trellis
- * recursive - Set to '1' if the code is recursive
- * intrvl    - Normalization interval
- * trellis   - Trellis object
- * punc      - Puncturing sequence
- * paths     - Trellis paths
- */
-struct vdecoder {
-	int n;
-	int k;
-	int len;
-	int recursive;
-	int intrvl;
-	struct vtrellis *trellis;
-	int *punc;
-	int16_t **paths;
-
-	void (*metric_func)(const int8_t *, const int16_t *,
-		int16_t *, int16_t *, int);
-};
-
-/* Accessor calls */
-static inline int conv_code_recursive(const struct osmo_conv_code *code)
-{
-	return code->next_term_output ? 1 : 0;
-}
-
-/* Left shift and mask for finding the previous state */
-static unsigned vstate_lshift(unsigned reg, int k, int val)
-{
-	unsigned mask;
-
-	if (k == 5)
-		mask = 0x0e;
-	else if (k == 7)
-		mask = 0x3e;
-	else
-		mask = 0;
-
-	return ((reg << 1) & mask) | val;
-}
-
-/* Bit endian manipulators */
-static inline unsigned bitswap2(unsigned v)
-{
-	return ((v & 0x02) >> 1) | ((v & 0x01) << 1);
-}
-
-static inline unsigned bitswap3(unsigned v)
-{
-	return ((v & 0x04) >> 2) | ((v & 0x02) >> 0) |
-		((v & 0x01) << 2);
-}
-
-static inline unsigned bitswap4(unsigned v)
-{
-	return ((v & 0x08) >> 3) | ((v & 0x04) >> 1) |
-		((v & 0x02) << 1) | ((v & 0x01) << 3);
-}
-
-static inline unsigned bitswap5(unsigned v)
-{
-	return ((v & 0x10) >> 4) | ((v & 0x08) >> 2) | ((v & 0x04) >> 0) |
-		((v & 0x02) << 2) | ((v & 0x01) << 4);
-}
-
-static inline unsigned bitswap6(unsigned v)
-{
-	return ((v & 0x20) >> 5) | ((v & 0x10) >> 3) | ((v & 0x08) >> 1) |
-		((v & 0x04) << 1) | ((v & 0x02) << 3) | ((v & 0x01) << 5);
-}
-
-static unsigned bitswap(unsigned v, unsigned n)
-{
-	switch (n) {
-	case 1:
-		return v;
-	case 2:
-		return bitswap2(v);
-	case 3:
-		return bitswap3(v);
-	case 4:
-		return bitswap4(v);
-	case 5:
-		return bitswap5(v);
-	case 6:
-		return bitswap6(v);
-	default:
-		return 0;
-	}
-}
-
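/*
 * [Editor's sketch -- not part of the patch] The bitswap helpers above
 * reverse the order of the n low bits of a word, translating between the
 * right-shifting register convention used here and the left-shifting
 * convention expected by the osmocom transition tables (see the comment
 * on gen_output() below). A generic loop-based reversal produces the
 * same results and can serve as a cross-check for the unrolled variants;
 * bitswap_ref() is an illustrative helper, not part of the library.
 */
#include <assert.h>
#include <stdio.h>

static unsigned bitswap_ref(unsigned v, unsigned n)
{
	unsigned i, r = 0;

	for (i = 0; i < n; i++)		/* mirror bit i to bit n-1-i */
		if (v & (1u << i))
			r |= 1u << (n - 1 - i);
	return r;
}

int main(void)
{
	assert(bitswap_ref(0x6, 3) == 0x3);	/* 110 -> 011 */
	assert(bitswap_ref(0x1, 6) == 0x20);	/* 000001 -> 100000 */
	/* reversing twice is the identity */
	assert(bitswap_ref(bitswap_ref(0x2b, 6), 6) == 0x2b);
	puts("bit-reversal cross-checks passed");
	return 0;
}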
-/* Generate non-recursive state output from generator state table
- * Note that the shift register moves right (i.e. the most recent bit is
- * shifted in at the k-1 bit position), which is the typical textbook
- * representation. The API transition table expects the most recent bit
- * in the low order bit, or left shift. A bitswap operation is required
- * to accommodate the difference.
- */
-static unsigned gen_output(struct vstate *state, int val,
-	const struct osmo_conv_code *code)
-{
-	unsigned out, prev;
-
-	prev = bitswap(state->prev[0], code->K - 1);
-	out = code->next_output[prev][val];
-	out = bitswap(out, code->N);
-
-	return out;
-}
-
-/* Populate non-recursive trellis state
- * For a given state defined by the k-1 length shift register, find the
- * value of the input bit that drove the trellis to that state. Also
- * generate the N outputs of the generator polynomial at that state.
- */
-static int gen_state_info(uint8_t *val, unsigned reg,
-	int16_t *output, const struct osmo_conv_code *code)
-{
-	int i;
-	unsigned out;
-	struct vstate state;
-
-	/* Previous '0' state */
-	state.state = reg;
-	state.prev[0] = vstate_lshift(reg, code->K, 0);
-	state.prev[1] = vstate_lshift(reg, code->K, 1);
-
-	*val = (reg >> (code->K - 2)) & 0x01;
-
-	/* Transition output */
-	out = gen_output(&state, *val, code);
-
-	/* Unpack to NRZ */
-	for (i = 0; i < code->N; i++)
-		output[i] = BIT2NRZ(out, i);
-
-	return 0;
-}
-
-/* Generate recursive state output from generator state table */
-static unsigned gen_recursive_output(struct vstate *state,
-	uint8_t *val, unsigned reg,
-	const struct osmo_conv_code *code, int pos)
-{
-	int val0, val1;
-	unsigned out, prev;
-
-	/* Previous '0' state */
-	prev = vstate_lshift(reg, code->K, 0);
-	prev = bitswap(prev, code->K - 1);
-
-	/* Input value */
-	val0 = (reg >> (code->K - 2)) & 0x01;
-	val1 = (code->next_term_output[prev] >> pos) & 0x01;
-	*val = val0 == val1 ? 0 : 1;
-
-	/* Wrapper for osmocom state access */
-	prev = bitswap(state->prev[0], code->K - 1);
-
-	/* Compute the transition output */
-	out = code->next_output[prev][*val];
-	out = bitswap(out, code->N);
-
-	return out;
-}
-
-/* Populate recursive trellis state
- * The bit position of the systematic bit is not explicitly marked by the
- * API, so it must be extracted from the generator table. Otherwise,
- * populate the trellis similarly to the non-recursive version.
- * Non-systematic recursive codes are not supported.
- */
-static int gen_recursive_state_info(uint8_t *val,
-	unsigned reg, int16_t *output, const struct osmo_conv_code *code)
-{
-	int i, j, pos = -1;
-	int ns = NUM_STATES(code->K);
-	unsigned out;
-	struct vstate state;
-
-	/* Previous '0' and '1' states */
-	state.state = reg;
-	state.prev[0] = vstate_lshift(reg, code->K, 0);
-	state.prev[1] = vstate_lshift(reg, code->K, 1);
-
-	/* Find recursive bit location */
-	for (i = 0; i < code->N; i++) {
-		for (j = 0; j < ns; j++) {
-			if ((code->next_output[j][0] >> i) & 0x01)
-				break;
-		}
-
-		if (j == ns) {
-			pos = i;
-			break;
-		}
-	}
-
-	/* Non-systematic recursive code not supported */
-	if (pos < 0)
-		return -EPROTO;
-
-	/* Transition output */
-	out = gen_recursive_output(&state, val, reg, code, pos);
-
-	/* Unpack to NRZ */
-	for (i = 0; i < code->N; i++)
-		output[i] = BIT2NRZ(out, i);
-
-	return 0;
-}
-
-/* Release the trellis */
-static void free_trellis(struct vtrellis *trellis)
-{
-	if (!trellis)
-		return;
-
-	vdec_free(trellis->outputs);
-	vdec_free(trellis->sums);
-	free(trellis->vals);
-	free(trellis);
-}
-
-/* Allocate and initialize the trellis object
- * Initialization consists of generating the outputs and output value of a
- * given state. Due to trellis symmetry and anti-symmetry, only one of the
- * transition paths is utilized by the butterfly operation in the forward
- * recursion, so only one set of N outputs is required per state variable.
- */
-static struct vtrellis *generate_trellis(const struct osmo_conv_code *code)
-{
-	int i, rc = -1;
-	struct vtrellis *trellis;
-	int16_t *outputs;
-
-	int ns = NUM_STATES(code->K);
-	int recursive = conv_code_recursive(code);