From 551ce88b5194db07762192a9102ec3ca372b59d6 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 28 Apr 2026 14:03:40 +0300 Subject: [PATCH 01/12] Audio: MFCC: Improve generic s16 source and sink copy functions This patch updates the data clear and copy functions mfcc_sink_copy_zero_s16() and mfcc_sink_copy_data_s16() to use memset() and memcpy() instead of looping sample by sample. The function mfcc_source_copy_s16() is moved further down, under CONFIG_FORMAT_S16LE where it belongs. There are no changes to the function itself. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_generic.c | 115 ++++++++++++++++------------------ 1 file changed, 53 insertions(+), 62 deletions(-) diff --git a/src/audio/mfcc/mfcc_generic.c b/src/audio/mfcc/mfcc_generic.c index ecc95474326b..c78481ea7a2c 100644 --- a/src/audio/mfcc/mfcc_generic.c +++ b/src/audio/mfcc/mfcc_generic.c @@ -26,53 +26,6 @@ * MFCC algorithm code */ -void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int32_t s; - int16_t *x0; - int16_t *x = audio_stream_get_rptr(source); - int16_t *w = buf->w_ptr; - int copied; - int nmax; - int n1; - int n2; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - - /* Copy from source to pre-buffer for FFT. - * The pre-emphasis filter is done in this step. 
- */ - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n1 = audio_stream_frames_without_wrap(source, x); - n2 = mfcc_buffer_samples_without_wrap(buf, w); - n = MIN(n1, n2); - n = MIN(n, nmax); - x0 = x + source_channel; - for (i = 0; i < n; i++) { - if (emph->enable) { - /* Q1.15 x Q1.15 -> Q2.30 */ - s = (int32_t)emph->delay * emph->coef + Q_SHIFT_LEFT(*x0, 15, 30); - *w = sat_int16(Q_SHIFT_RND(s, 30, 15)); - emph->delay = *x0; - } else { - *w = *x0; - } - x0 += num_channels; - w++; - } - - x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source)); - w = mfcc_buffer_wrap(buf, w); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = w; -} - void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, int prev_data_length) { @@ -190,24 +143,66 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) #if CONFIG_FORMAT_S16LE +void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + struct audio_stream *source = bsource->data; + int32_t s; + int16_t *x0; + int16_t *x = audio_stream_get_rptr(source); + int16_t *w = buf->w_ptr; + int copied; + int nmax; + int n1; + int n2; + int n; + int i; + int num_channels = audio_stream_get_channels(source); + + /* Copy from source to pre-buffer for FFT. + * The pre-emphasis filter is done in this step. 
+ */ + for (copied = 0; copied < frames; copied += n) { + nmax = frames - copied; + n1 = audio_stream_frames_without_wrap(source, x); + n2 = mfcc_buffer_samples_without_wrap(buf, w); + n = MIN(n1, n2); + n = MIN(n, nmax); + x0 = x + source_channel; + for (i = 0; i < n; i++) { + if (emph->enable) { + /* Q1.15 x Q1.15 -> Q2.30 */ + s = (int32_t)emph->delay * emph->coef + Q_SHIFT_LEFT(*x0, 15, 30); + *w = sat_int16(Q_SHIFT_RND(s, 30, 15)); + emph->delay = *x0; + } else { + *w = *x0; + } + x0 += num_channels; + w++; + } + + x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source)); + w = mfcc_buffer_wrap(buf, w); + } + buf->s_avail += copied; + buf->s_free -= copied; + buf->w_ptr = w; +} + int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, int16_t *w_ptr, int samples) { int copied; int nmax; - int i; int n; for (copied = 0; copied < samples; copied += n) { nmax = samples - copied; n = audio_stream_samples_without_wrap_s16(sink, w_ptr); n = MIN(n, nmax); - for (i = 0; i < n; i++) { - *w_ptr = 0; - w_ptr++; - } - - w_ptr = audio_stream_wrap(sink, w_ptr); + memset(w_ptr, 0, n * sizeof(int16_t)); + w_ptr = audio_stream_wrap(sink, w_ptr + n); } return w_ptr; @@ -218,20 +213,16 @@ int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr { int copied; int nmax; - int i; int n; for (copied = 0; copied < samples; copied += n) { nmax = samples - copied; n = audio_stream_samples_without_wrap_s16(sink, w_ptr); n = MIN(n, nmax); - for (i = 0; i < n; i++) { - *w_ptr = *r_ptr; - r_ptr++; - w_ptr++; - } - - w_ptr = audio_stream_wrap(sink, w_ptr); + /* Not using memcpy_s() due to speed need */ + memcpy(w_ptr, r_ptr, n * sizeof(int16_t)); + w_ptr = audio_stream_wrap(sink, w_ptr + n); + r_ptr += n; } return w_ptr; From 0eb5952496356d56f082ddfdbccde86afe52d530 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 28 Apr 2026 18:11:50 +0300 Subject: [PATCH 02/12] Audio: MFCC: Remove unnecessary data copy HiFi3/4 code The memset() and 
memcpy() are as fast as the HiFi data clear and copy functions, so the functions mfcc_sink_copy_zero_s16() and mfcc_sink_copy_data_s16() can be moved to mfcc_common.c. This change will also help with possible changes to the audio features output data format in the future. The current data format, a fake PCM stream, may change to a compressed encoded stream type. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_common.c | 40 ++++++++++++++- src/audio/mfcc/mfcc_generic.c | 45 ++--------------- src/audio/mfcc/mfcc_hifi3.c | 68 ++------------------------ src/audio/mfcc/mfcc_hifi4.c | 68 ++------------------------ src/include/sof/audio/mfcc/mfcc_comp.h | 8 +-- 5 files changed, 53 insertions(+), 176 deletions(-) diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index 688c7afac9b2..aaf99080a8ba 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: BSD-3-Clause // -// Copyright(c) 2023 Intel Corporation. All rights reserved. +// Copyright(c) 2023-2026 Intel Corporation. 
// // Author: Andrula Song @@ -140,6 +140,44 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_state *stat } #if CONFIG_FORMAT_S16LE +static int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, + int16_t *w_ptr, int samples) +{ + int copied; + int nmax; + int n; + + for (copied = 0; copied < samples; copied += n) { + nmax = samples - copied; + n = audio_stream_samples_without_wrap_s16(sink, w_ptr); + n = MIN(n, nmax); + memset(w_ptr, 0, n * sizeof(int16_t)); + w_ptr = audio_stream_wrap(sink, w_ptr + n); + } + + return w_ptr; +} + +static int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr, + int samples, int16_t *r_ptr) +{ + int copied; + int nmax; + int n; + + for (copied = 0; copied < samples; copied += n) { + nmax = samples - copied; + n = audio_stream_samples_without_wrap_s16(sink, w_ptr); + n = MIN(n, nmax); + /* Not using memcpy_s() due to speed need */ + memcpy(w_ptr, r_ptr, n * sizeof(int16_t)); + w_ptr = audio_stream_wrap(sink, w_ptr + n); + r_ptr += n; + } + + return w_ptr; +} + void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource, struct output_stream_buffer *bsink, int frames) { diff --git a/src/audio/mfcc/mfcc_generic.c b/src/audio/mfcc/mfcc_generic.c index c78481ea7a2c..92e3efee66f6 100644 --- a/src/audio/mfcc/mfcc_generic.c +++ b/src/audio/mfcc/mfcc_generic.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: BSD-3-Clause // -// Copyright(c) 2022 Intel Corporation. All rights reserved. +// Copyright(c) 2022-2026 Intel Corporation. 
// // Author: Seppo Ingalsuo @@ -142,7 +142,6 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) } #if CONFIG_FORMAT_S16LE - void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, struct mfcc_pre_emph *emph, int frames, int source_channel) { @@ -189,44 +188,6 @@ void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffe buf->s_free -= copied; buf->w_ptr = w; } - -int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, - int16_t *w_ptr, int samples) -{ - int copied; - int nmax; - int n; - - for (copied = 0; copied < samples; copied += n) { - nmax = samples - copied; - n = audio_stream_samples_without_wrap_s16(sink, w_ptr); - n = MIN(n, nmax); - memset(w_ptr, 0, n * sizeof(int16_t)); - w_ptr = audio_stream_wrap(sink, w_ptr + n); - } - - return w_ptr; -} - -int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr, - int samples, int16_t *r_ptr) -{ - int copied; - int nmax; - int n; - - for (copied = 0; copied < samples; copied += n) { - nmax = samples - copied; - n = audio_stream_samples_without_wrap_s16(sink, w_ptr); - n = MIN(n, nmax); - /* Not using memcpy_s() due to speed need */ - memcpy(w_ptr, r_ptr, n * sizeof(int16_t)); - w_ptr = audio_stream_wrap(sink, w_ptr + n); - r_ptr += n; - } - - return w_ptr; -} - #endif /* CONFIG_FORMAT_S16LE */ -#endif + +#endif /* MFCC_GENERIC */ diff --git a/src/audio/mfcc/mfcc_hifi3.c b/src/audio/mfcc/mfcc_hifi3.c index b3b5d99967db..1baed9d0c9eb 100644 --- a/src/audio/mfcc/mfcc_hifi3.c +++ b/src/audio/mfcc/mfcc_hifi3.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: BSD-3-Clause // -// Copyright(c) 2023 Intel Corporation. All rights reserved. +// Copyright(c) 2023-2026 Intel Corporation. 
// // Author: Andrula Song @@ -35,6 +35,7 @@ static inline void set_circular_buf0(const void *start, const void *end) * MFCC algorithm code */ +#if CONFIG_FORMAT_S16LE void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, struct mfcc_pre_emph *emph, int frames, int source_channel) { @@ -92,6 +93,7 @@ void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffe buf->s_free -= copied; buf->w_ptr = (int16_t *)out; } +#endif /* CONFIG_FORMAT_S16LE */ void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, int prev_data_length) @@ -192,6 +194,7 @@ int mfcc_normalize_fft_buffer(struct mfcc_state *state) return shift; } #endif + void mfcc_apply_window(struct mfcc_state *state, int input_shift) { struct mfcc_fft *fft = &state->fft; @@ -229,65 +232,4 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) #endif } -#if CONFIG_FORMAT_S16LE - -int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, - int16_t *w_ptr, int samples) -{ - int i; - int n = samples >> 2; - int m = samples & 0x03; - ae_int16x4 *out = (ae_int16x4 *)w_ptr; - const int inc = sizeof(ae_int16); - ae_valign outu = AE_ZALIGN64(); - ae_int16x4 zero = AE_ZERO16(); - - set_circular_buf0(sink->addr, sink->end_addr); - - for (i = 0; i < n; i++) - AE_SA16X4_IC(zero, outu, out); - - AE_SA64POS_FP(outu, out); - /* process the left samples that less than 4 - * one by one to avoid memory access overrun - */ - for (i = 0; i < m ; i++) - AE_S16_0_XC(zero, (ae_int16 *)out, inc); - - return (int16_t *)out; -} - -int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr, - int samples, int16_t *r_ptr) -{ - int i; - int n = samples >> 2; - int m = samples & 0x03; - ae_int16x4 *out = (ae_int16x4 *)w_ptr; - ae_int16x4 *in = (ae_int16x4 *)r_ptr; - ae_valign outu = AE_ZALIGN64(); - ae_valign inu = AE_ZALIGN64(); - const int inc = sizeof(ae_int16); - ae_int16x4 in_sample; - - set_circular_buf0(sink->addr, 
sink->end_addr); - - inu = AE_LA64_PP(in); - for (i = 0; i < n; i++) { - AE_LA16X4_IP(in_sample, inu, in); - AE_SA16X4_IC(in_sample, outu, out); - } - AE_SA64POS_FP(outu, out); - /* process the left samples that less than 4 - * one by one to avoid memory access overrun - */ - for (i = 0; i < m ; i++) { - AE_L16_XP(in_sample, (ae_int16 *)in, inc); - AE_S16_0_XC(in_sample, (ae_int16 *)out, inc); - } - - return (int16_t *)out; -} - -#endif /* CONFIG_FORMAT_S16LE */ -#endif +#endif /* MFCC_HIFI3 */ diff --git a/src/audio/mfcc/mfcc_hifi4.c b/src/audio/mfcc/mfcc_hifi4.c index 60a4de62ec23..c5783dcc90ec 100644 --- a/src/audio/mfcc/mfcc_hifi4.c +++ b/src/audio/mfcc/mfcc_hifi4.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: BSD-3-Clause // -// Copyright(c) 2023 Intel Corporation. All rights reserved. +// Copyright(c) 2023-2026 Intel Corporation. // // Author: Andrula Song @@ -41,6 +41,8 @@ static inline void set_circular_buf1(const void *start, const void *end) /* * MFCC algorithm code */ + +#if CONFIG_FORMAT_S16LE void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, struct mfcc_pre_emph *emph, int frames, int source_channel) { @@ -87,6 +89,7 @@ void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffe buf->s_free -= frames; buf->w_ptr = (int16_t *)out; } +#endif /* CONFIG_FORMAT_S16LE */ void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, int prev_data_length) @@ -225,65 +228,4 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) #endif } -#if CONFIG_FORMAT_S16LE - -int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, - int16_t *w_ptr, int samples) -{ - int i; - int n = samples >> 2; - int m = samples & 0x03; - ae_int16x4 *out = (ae_int16x4 *)w_ptr; - const int inc = sizeof(ae_int16); - ae_valign outu = AE_ZALIGN64(); - ae_int16x4 zero = AE_ZERO16(); - - set_circular_buf0(sink->addr, sink->end_addr); - - for (i = 0; i < n; i++) - AE_SA16X4_IC(zero, outu, out); - - 
AE_SA64POS_FP(outu, out); - /* process the left samples that less than 4 - * one by one to avoid memory access overrun - */ - for (i = 0; i < m ; i++) - AE_S16_0_XC(zero, (ae_int16 *)out, inc); - - return (int16_t *)out; -} - -int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr, - int samples, int16_t *r_ptr) -{ - int i; - int n = samples >> 2; - int m = samples & 0x03; - ae_int16x4 *out = (ae_int16x4 *)w_ptr; - ae_int16x4 *in = (ae_int16x4 *)r_ptr; - ae_valign outu = AE_ZALIGN64(); - ae_valign inu = AE_ZALIGN64(); - const int inc = sizeof(ae_int16); - ae_int16x4 in_sample; - - set_circular_buf0(sink->addr, sink->end_addr); - - inu = AE_LA64_PP(in); - for (i = 0; i < n; i++) { - AE_LA16X4_IP(in_sample, inu, in); - AE_SA16X4_IC(in_sample, outu, out); - } - AE_SA64POS_FP(outu, out); - /* process the left samples that less than 4 - * one by one to avoid memory access overrun - */ - for (i = 0; i < m ; i++) { - AE_L16_XP(in_sample, (ae_int16 *)in, inc); - AE_S16_0_XC(in_sample, (ae_int16 *)out, inc); - } - - return (int16_t *)out; -} - -#endif /* CONFIG_FORMAT_S16LE */ -#endif +#endif /* MFCC_HIFI4 */ diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index 7323428ec37d..798643d2389b 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * - * Copyright(c) 2022 Intel Corporation. All rights reserved. + * Copyright(c) 2022-2026 Intel Corporation. 
* * Author: Seppo Ingalsuo */ @@ -175,12 +175,6 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift); #if CONFIG_FORMAT_S16LE -int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, - int16_t *w_ptr, int samples); - -int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr, - int samples, int16_t *r_ptr); - void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource, struct output_stream_buffer *bsink, int frames); #endif From 856d3fce1ba51b36a5207b99003992e96129ad55 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Mon, 27 Apr 2026 19:49:11 +0300 Subject: [PATCH 03/12] Audio: MFCC: Add S24 and S32 format support Add S24_4LE and S32_LE processing functions for the MFCC component. The new format variants convert input samples to the internal 16-bit representation for FFT processing and pack the 16-bit cepstral output into the 32-bit sink samples. Implementations are added for generic, HiFi3, and HiFi4 architectures. The source copy functions handle pre-emphasis filtering together with the format conversion. The sink copy functions write the 16-bit cepstral coefficients packed two per 32-bit sink sample. The MFCC magic marker is written directly as a raw 32-bit value without format conversion. The function map in mfcc.c is updated to wire the new processing functions for S24_4LE and S32_LE formats. 
Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc.c | 6 +- src/audio/mfcc/mfcc_common.c | 104 +++++++++++++++++++++ src/audio/mfcc/mfcc_generic.c | 108 ++++++++++++++++++++++ src/audio/mfcc/mfcc_hifi3.c | 120 +++++++++++++++++++++++++ src/audio/mfcc/mfcc_hifi4.c | 102 +++++++++++++++++++++ src/include/sof/audio/mfcc/mfcc_comp.h | 27 ++++-- 6 files changed, 458 insertions(+), 9 deletions(-) diff --git a/src/audio/mfcc/mfcc.c b/src/audio/mfcc/mfcc.c index 9874edea4be5..656e3d9b7bf7 100644 --- a/src/audio/mfcc/mfcc.c +++ b/src/audio/mfcc/mfcc.c @@ -38,13 +38,13 @@ SOF_DEFINE_REG_UUID(mfcc); __cold_rodata const struct mfcc_func_map mfcc_fm[] = { #if CONFIG_FORMAT_S16LE - {SOF_IPC_FRAME_S16_LE, mfcc_s16_default}, + {SOF_IPC_FRAME_S16_LE, mfcc_s16_default}, #endif /* CONFIG_FORMAT_S16LE */ #if CONFIG_FORMAT_S24LE - {SOF_IPC_FRAME_S24_4LE, NULL}, + {SOF_IPC_FRAME_S24_4LE, mfcc_s24_default}, #endif /* CONFIG_FORMAT_S24LE */ #if CONFIG_FORMAT_S32LE - {SOF_IPC_FRAME_S32_LE, NULL}, + {SOF_IPC_FRAME_S32_LE, mfcc_s32_default}, #endif /* CONFIG_FORMAT_S32LE */ }; diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index aaf99080a8ba..2ca6dcb6ff19 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -214,8 +214,112 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer } #endif /* CONFIG_FORMAT_S16LE */ +#if CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE +static int32_t *mfcc_sink_copy_zero_s32(const struct audio_stream *sink, int32_t *w_ptr, + int samples) +{ + int copied; + int nmax; + int n; + + for (copied = 0; copied < samples; copied += n) { + nmax = samples - copied; + n = audio_stream_samples_without_wrap_s32(sink, w_ptr); + n = MIN(n, nmax); + memset(w_ptr, 0, n * sizeof(int32_t)); + w_ptr = audio_stream_wrap(sink, w_ptr + n); + } + + return w_ptr; +} + +static int32_t *mfcc_sink_copy_data_s32(const struct audio_stream *sink, int32_t *w_ptr, + int samples, int32_t *r_ptr) +{ + int copied; + int 
nmax; + int n; + + for (copied = 0; copied < samples; copied += n) { + nmax = samples - copied; + n = audio_stream_samples_without_wrap_s32(sink, w_ptr); + n = MIN(n, nmax); + /* Not using memcpy_s() due to speed need */ + memcpy(w_ptr, r_ptr, n * sizeof(int32_t)); + w_ptr = audio_stream_wrap(sink, w_ptr + n); + r_ptr += n; + } + + return w_ptr; +} +#endif /* CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE */ + #if CONFIG_FORMAT_S24LE +void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer *bsource, + struct output_stream_buffer *bsink, int frames) +{ + struct audio_stream *sink = bsink->data; + struct mfcc_comp_data *cd = module_get_private_data(mod); + struct mfcc_state *state = &cd->state; + struct mfcc_buffer *buf = &cd->state.buf; + uint32_t magic = MFCC_MAGIC; + int32_t *w_ptr = audio_stream_get_wptr(sink); + const int num_magic = 1; /* one int32_t word for magic */ + int num_ceps; + int ceps_s32; + int zero_samples; + + /* Get samples from source buffer */ + mfcc_source_copy_s24(bsource, buf, &state->emph, frames, state->source_channel); + + /* Run STFT and processing after FFT */ + num_ceps = mfcc_stft_process(mod->dev, state); + + /* Copy data to sink. Pack int16_t cepstral data into int32_t samples. 
*/ + zero_samples = frames * audio_stream_get_channels(sink); + if (num_ceps > 0) { + ceps_s32 = (num_ceps + 1) / 2; + zero_samples -= ceps_s32 + num_magic; + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic); + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, ceps_s32, + (int32_t *)state->cepstral_coef->data); + } + + w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, zero_samples); +} #endif /* CONFIG_FORMAT_S24LE */ #if CONFIG_FORMAT_S32LE +void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer *bsource, + struct output_stream_buffer *bsink, int frames) +{ + struct audio_stream *sink = bsink->data; + struct mfcc_comp_data *cd = module_get_private_data(mod); + struct mfcc_state *state = &cd->state; + struct mfcc_buffer *buf = &cd->state.buf; + uint32_t magic = MFCC_MAGIC; + int32_t *w_ptr = audio_stream_get_wptr(sink); + const int num_magic = 1; /* one int32_t word for magic */ + int num_ceps; + int ceps_s32; + int zero_samples; + + /* Get samples from source buffer */ + mfcc_source_copy_s32(bsource, buf, &state->emph, frames, state->source_channel); + + /* Run STFT and processing after FFT */ + num_ceps = mfcc_stft_process(mod->dev, state); + + /* Copy data to sink. Pack int16_t cepstral data into int32_t samples. 
*/ + zero_samples = frames * audio_stream_get_channels(sink); + if (num_ceps > 0) { + ceps_s32 = (num_ceps + 1) / 2; + zero_samples -= ceps_s32 + num_magic; + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic); + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, ceps_s32, + (int32_t *)state->cepstral_coef->data); + } + + w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, zero_samples); +} #endif /* CONFIG_FORMAT_S32LE */ diff --git a/src/audio/mfcc/mfcc_generic.c b/src/audio/mfcc/mfcc_generic.c index 92e3efee66f6..48d2b2e88997 100644 --- a/src/audio/mfcc/mfcc_generic.c +++ b/src/audio/mfcc/mfcc_generic.c @@ -190,4 +190,112 @@ void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffe } #endif /* CONFIG_FORMAT_S16LE */ +#if CONFIG_FORMAT_S24LE + +void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + struct audio_stream *source = bsource->data; + int32_t tmp, s; + int32_t *x0; + int32_t *x = audio_stream_get_rptr(source); + int16_t *w = buf->w_ptr; + int copied; + int nmax; + int n1; + int n2; + int n; + int i; + int num_channels = audio_stream_get_channels(source); + + /* Copy from source to pre-buffer for FFT. + * The pre-emphasis filter is done in this step. + * S24_4LE data is in 32-bit container, shift left by 8 to Q1.31, + * then convert to Q1.15 with rounding. 
+ */ + for (copied = 0; copied < frames; copied += n) { + nmax = frames - copied; + n1 = audio_stream_frames_without_wrap(source, x); + n2 = mfcc_buffer_samples_without_wrap(buf, w); + n = MIN(n1, n2); + n = MIN(n, nmax); + x0 = x + source_channel; + for (i = 0; i < n; i++) { + if (emph->enable) { + /* Convert to Q1.31, ignore highest byte */ + s = (int32_t)((uint32_t)*x0 << 8); + /* Q1.15 x Q1.15 -> Q2.30 */ + tmp = (int32_t)emph->delay * emph->coef + Q_SHIFT(s, 31, 30); + *w = sat_int16(Q_SHIFT_RND(tmp, 30, 15)); + emph->delay = sat_int16(Q_SHIFT_RND(s, 31, 15)); + } else { + /* Convert to Q1.31, ignore highest byte */ + s = (int32_t)((uint32_t)*x0 << 8); + *w = sat_int16(Q_SHIFT_RND(s, 31, 15)); + } + x0 += num_channels; + w++; + } + + x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source)); + w = mfcc_buffer_wrap(buf, w); + } + buf->s_avail += copied; + buf->s_free -= copied; + buf->w_ptr = w; +} + +#endif /* CONFIG_FORMAT_S24LE */ + +#if CONFIG_FORMAT_S32LE + +void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + struct audio_stream *source = bsource->data; + int32_t s; + int32_t *x0; + int32_t *x = audio_stream_get_rptr(source); + int16_t *w = buf->w_ptr; + int copied; + int nmax; + int n1; + int n2; + int n; + int i; + int num_channels = audio_stream_get_channels(source); + + /* Copy from source to pre-buffer for FFT. + * The pre-emphasis filter is done in this step. + * S32 data is in 32-bit container, shift right by 16 to get 16-bit. 
+ */ + for (copied = 0; copied < frames; copied += n) { + nmax = frames - copied; + n1 = audio_stream_frames_without_wrap(source, x); + n2 = mfcc_buffer_samples_without_wrap(buf, w); + n = MIN(n1, n2); + n = MIN(n, nmax); + x0 = x + source_channel; + for (i = 0; i < n; i++) { + if (emph->enable) { + /* Q1.15 x Q1.15 -> Q2.30 */ + s = (int32_t)emph->delay * emph->coef + Q_SHIFT(*x0, 31, 30); + *w = sat_int16(Q_SHIFT_RND(s, 30, 15)); + emph->delay = sat_int16(Q_SHIFT_RND(*x0, 31, 15)); + } else { + *w = sat_int16(Q_SHIFT_RND(*x0, 31, 15)); + } + x0 += num_channels; + w++; + } + + x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source)); + w = mfcc_buffer_wrap(buf, w); + } + buf->s_avail += copied; + buf->s_free -= copied; + buf->w_ptr = w; +} +#endif /* CONFIG_FORMAT_S32LE */ + #endif /* MFCC_GENERIC */ diff --git a/src/audio/mfcc/mfcc_hifi3.c b/src/audio/mfcc/mfcc_hifi3.c index 1baed9d0c9eb..153048d67bf7 100644 --- a/src/audio/mfcc/mfcc_hifi3.c +++ b/src/audio/mfcc/mfcc_hifi3.c @@ -232,4 +232,124 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) #endif } +#if CONFIG_FORMAT_S24LE +void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + struct audio_stream *source = bsource->data; + int copied; + int nmax; + int n; + int i; + int num_channels = audio_stream_get_channels(source); + ae_int32 *in; + ae_int32 *x = (ae_int32 *)audio_stream_get_rptr(source); + ae_int16 *out = (ae_int16 *)buf->w_ptr; + ae_int32x2 sample32; + ae_int16x4 sample; + ae_int32x2 temp; + ae_int16x4 coef = emph->coef; + ae_int16x4 delay; + const int in_inc = sizeof(ae_int32) * num_channels; + + for (copied = 0; copied < frames; copied += n) { + nmax = frames - copied; + n = audio_stream_frames_without_wrap(source, x); + n = MIN(n, nmax); + nmax = mfcc_buffer_samples_without_wrap(buf, (int16_t *)out); + n = MIN(n, nmax); + in = x + source_channel; + if (emph->enable) { 
+ delay = emph->delay; + for (i = 0; i < n; i++) { + AE_L32_XP(sample32, in, in_inc); + /* S24_4LE: shift right by 8 to get 16-bit, then convert */ + sample32 = AE_SRAI32(sample32, 8); + sample = AE_SAT16X4(sample32, sample32); + /* Q1.15 -> Q1.31 */ + temp = AE_CVT32X2F16_10(sample); + AE_MULAF16SS_00(temp, delay, coef); + delay = sample; + sample = AE_ROUND16X4F32SSYM(temp, temp); + AE_S16_0_IP(sample, out, 2); + } + emph->delay = delay; + } else { + for (i = 0; i < n; i++) { + AE_L32_XP(sample32, in, in_inc); + sample32 = AE_SRAI32(sample32, 8); + sample = AE_SAT16X4(sample32, sample32); + AE_S16_0_IP(sample, out, 2); + } + } + + x = audio_stream_wrap(source, x + n * num_channels); + out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out); + } + buf->s_avail += copied; + buf->s_free -= copied; + buf->w_ptr = (int16_t *)out; +} +#endif /* CONFIG_FORMAT_S24LE */ + +#if CONFIG_FORMAT_S32LE +void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + struct audio_stream *source = bsource->data; + int copied; + int nmax; + int n; + int i; + int num_channels = audio_stream_get_channels(source); + ae_int32 *in; + ae_int32 *x = (ae_int32 *)audio_stream_get_rptr(source); + ae_int16 *out = (ae_int16 *)buf->w_ptr; + ae_int32x2 sample32; + ae_int16x4 sample; + ae_int32x2 temp; + ae_int16x4 coef = emph->coef; + ae_int16x4 delay; + const int in_inc = sizeof(ae_int32) * num_channels; + + for (copied = 0; copied < frames; copied += n) { + nmax = frames - copied; + n = audio_stream_frames_without_wrap(source, x); + n = MIN(n, nmax); + nmax = mfcc_buffer_samples_without_wrap(buf, (int16_t *)out); + n = MIN(n, nmax); + in = x + source_channel; + if (emph->enable) { + delay = emph->delay; + for (i = 0; i < n; i++) { + AE_L32_XP(sample32, in, in_inc); + /* S32: shift right by 16 to get 16-bit */ + sample32 = AE_SRAI32(sample32, 16); + sample = AE_SAT16X4(sample32, sample32); + /* Q1.15 -> 
Q1.31 */ + temp = AE_CVT32X2F16_10(sample); + AE_MULAF16SS_00(temp, delay, coef); + delay = sample; + sample = AE_ROUND16X4F32SSYM(temp, temp); + AE_S16_0_IP(sample, out, 2); + } + emph->delay = delay; + } else { + for (i = 0; i < n; i++) { + AE_L32_XP(sample32, in, in_inc); + sample32 = AE_SRAI32(sample32, 16); + sample = AE_SAT16X4(sample32, sample32); + AE_S16_0_IP(sample, out, 2); + } + } + + x = audio_stream_wrap(source, x + n * num_channels); + out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out); + } + buf->s_avail += copied; + buf->s_free -= copied; + buf->w_ptr = (int16_t *)out; +} +#endif /* CONFIG_FORMAT_S32LE */ + #endif /* MFCC_HIFI3 */ diff --git a/src/audio/mfcc/mfcc_hifi4.c b/src/audio/mfcc/mfcc_hifi4.c index c5783dcc90ec..c9bd59ada18b 100644 --- a/src/audio/mfcc/mfcc_hifi4.c +++ b/src/audio/mfcc/mfcc_hifi4.c @@ -228,4 +228,106 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) #endif } +#if CONFIG_FORMAT_S24LE +void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + struct audio_stream *source = bsource->data; + int num_channels = audio_stream_get_channels(source); + ae_int32 *in = (ae_int32 *)source->r_ptr + source_channel; + ae_int16 *out = (ae_int16 *)buf->w_ptr; + ae_int32x2 sample32; + ae_int16x4 sample; + ae_int32x2 temp; + ae_int16x4 coef; + ae_int16x4 delay; + const int in_inc = sizeof(ae_int32) * num_channels; + const int out_inc = sizeof(ae_int16); + int i; + + set_circular_buf1(buf->addr, buf->end_addr); + set_circular_buf0(source->addr, source->end_addr); + + if (emph->enable) { + delay = emph->delay; + coef = emph->coef; + for (i = 0; i < frames; i++) { + AE_L32_XC(sample32, in, in_inc); + /* S24_4LE: shift right by 8 to get 16-bit */ + sample32 = AE_SRAI32(sample32, 8); + sample = AE_SAT16X4(sample32, sample32); + /* Q1.15 -> Q1.31 */ + temp = AE_CVT32X2F16_10(sample); + AE_MULAF16SS_00(temp, delay, coef); + delay = 
sample; + sample = AE_ROUND16X4F32SSYM(temp, temp); + AE_S16_0_XC1(sample, out, out_inc); + } + emph->delay = delay; + } else { + for (i = 0; i < frames; i++) { + AE_L32_XC(sample32, in, in_inc); + sample32 = AE_SRAI32(sample32, 8); + sample = AE_SAT16X4(sample32, sample32); + AE_S16_0_XC1(sample, out, out_inc); + } + } + + buf->s_avail += frames; + buf->s_free -= frames; + buf->w_ptr = (int16_t *)out; +} +#endif /* CONFIG_FORMAT_S24LE */ + +#if CONFIG_FORMAT_S32LE +void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + struct audio_stream *source = bsource->data; + int num_channels = audio_stream_get_channels(source); + ae_int32 *in = (ae_int32 *)source->r_ptr + source_channel; + ae_int16 *out = (ae_int16 *)buf->w_ptr; + ae_int32x2 sample32; + ae_int16x4 sample; + ae_int32x2 temp; + ae_int16x4 coef; + ae_int16x4 delay; + const int in_inc = sizeof(ae_int32) * num_channels; + const int out_inc = sizeof(ae_int16); + int i; + + set_circular_buf1(buf->addr, buf->end_addr); + set_circular_buf0(source->addr, source->end_addr); + + if (emph->enable) { + delay = emph->delay; + coef = emph->coef; + for (i = 0; i < frames; i++) { + AE_L32_XC(sample32, in, in_inc); + /* S32: shift right by 16 to get 16-bit */ + sample32 = AE_SRAI32(sample32, 16); + sample = AE_SAT16X4(sample32, sample32); + /* Q1.15 -> Q1.31 */ + temp = AE_CVT32X2F16_10(sample); + AE_MULAF16SS_00(temp, delay, coef); + delay = sample; + sample = AE_ROUND16X4F32SSYM(temp, temp); + AE_S16_0_XC1(sample, out, out_inc); + } + emph->delay = delay; + } else { + for (i = 0; i < frames; i++) { + AE_L32_XC(sample32, in, in_inc); + sample32 = AE_SRAI32(sample32, 16); + sample = AE_SAT16X4(sample32, sample32); + AE_S16_0_XC1(sample, out, out_inc); + } + } + + buf->s_avail += frames; + buf->s_free -= frames; + buf->w_ptr = (int16_t *)out; +} +#endif /* CONFIG_FORMAT_S32LE */ + #endif /* MFCC_HIFI4 */ diff --git 
a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index 798643d2389b..e42be5b3ca0f 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -156,12 +156,6 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int rate, int chan void mfcc_free_buffers(struct processing_module *mod); -void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames); - -void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel); - void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, int prev_data_length); @@ -175,10 +169,31 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift); #if CONFIG_FORMAT_S16LE +void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel); + void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource, struct output_stream_buffer *bsink, int frames); #endif +#if CONFIG_FORMAT_S24LE + +void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel); + +void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer *bsource, + struct output_stream_buffer *bsink, int frames); +#endif + +#if CONFIG_FORMAT_S32LE + +void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel); + +void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer *bsource, + struct output_stream_buffer *bsink, int frames); +#endif + #ifdef UNIT_TEST void sys_comp_module_mfcc_interface_init(void); #endif From 49d80f14b9ed6de8d7e8ec974f0a715b4e33c67d Mon Sep 17 00:00:00 2001 From: Seppo 
Ingalsuo Date: Mon, 4 May 2026 15:58:58 +0300 Subject: [PATCH 04/12] Audio: MFCC: Improve check for input channel select The configuration blob uses value -1 for input channel select with mono format. This patch adds an error if the -1 is used for other than mono input stream. The low-information comp_info() trace print is moved to a better error message. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_setup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index dded450673ad..586103bbf3fb 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -139,10 +139,9 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i return -EINVAL; } - comp_info(dev, "source_channel = %d, stream_channels = %d", - config->channel, channels); - if (config->channel >= channels) { - comp_err(dev, "Illegal channel"); + if (config->channel >= channels || (config->channel < 0 && channels != 1)) { + comp_err(dev, "Illegal source_channel %d for stream channels %d", config->channel, + channels); return -EINVAL; } From 57731b4dd1454ab45da46db70f9fbc3ff80d4f97 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Fri, 24 Apr 2026 18:17:47 +0300 Subject: [PATCH 05/12] Audio: MFCC: Add Mel log spectra output mode when num_ceps is zero Add a mode where cepstral coefficients are not computed and the Mel frequency logarithm values are passed directly to the sink buffer. The mode is activated when sof_mfcc_config member num_ceps is set to zero. When num_ceps is zero: - DCT matrix and cepstral lifter are not allocated or initialized - The Mel log spectra (num_mel_bins values) are output to the sink instead of cepstral coefficients - A mel_only flag is added to mfcc_state for runtime path selection This is useful for applications that need Mel spectrogram features without the DCT transform, such as some neural network audio front-ends.
Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_common.c | 36 ++++++++++++----- src/audio/mfcc/mfcc_setup.c | 56 +++++++++++++++++--------- src/include/sof/audio/mfcc/mfcc_comp.h | 1 + 3 files changed, 63 insertions(+), 30 deletions(-) diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index 2ca6dcb6ff19..b80cc0b6b6f3 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -119,16 +119,22 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_state *stat state->mel_spectra->data, mel_scale_shift); #endif - /* Multiply Mel spectra with DCT matrix to get cepstral coefficients */ - mat_init_16b(state->cepstral_coef, 1, state->dct.num_out, 7); /* Q8.7 */ - mat_multiply(state->mel_spectra, state->dct.matrix, state->cepstral_coef); - - /* Apply cepstral lifter */ - if (state->lifter.cepstral_lifter != 0) - mat_multiply_elementwise(state->cepstral_coef, state->lifter.matrix, - state->cepstral_coef); - - cc_count += state->dct.num_out; + if (state->mel_only) { + /* In Mel-only mode output Mel log spectra directly */ + cc_count += state->dct.num_in; + } else { + /* Multiply Mel spectra with DCT matrix to get cepstral coefficients */ + mat_init_16b(state->cepstral_coef, 1, state->dct.num_out, 7); /* Q8.7 */ + mat_multiply(state->mel_spectra, state->dct.matrix, state->cepstral_coef); + + /* Apply cepstral lifter */ + if (state->lifter.cepstral_lifter != 0) { + mat_multiply_elementwise(state->cepstral_coef, state->lifter.matrix, + state->cepstral_coef); + } + + cc_count += state->dct.num_out; + } /* Output to sink buffer */ } @@ -205,9 +211,17 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer */ zero_samples = frames * audio_stream_get_channels(sink); if (num_ceps > 0) { + int16_t *out_data; + + if (state->mel_only) { + out_data = state->mel_spectra->data; + } else { + out_data = state->cepstral_coef->data; + } + zero_samples -= num_ceps + num_magic; w_ptr = 
mfcc_sink_copy_data_s16(sink, w_ptr, num_magic, (int16_t *)&magic); - w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_ceps, state->cepstral_coef->data); + w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_ceps, out_data); } w_ptr = mfcc_sink_copy_zero_s16(sink, w_ptr, zero_samples); diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 586103bbf3fb..7cf6f26d65ec 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -248,23 +248,37 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i goto free_fft_out; } - /* Setup DCT */ - dct->num_in = config->num_mel_bins; - dct->num_out = config->num_ceps; - dct->type = (enum dct_type)config->dct; - dct->ortho = true; - ret = mod_dct_initialize_16(mod, dct); - if (ret < 0) { - comp_err(dev, "Failed DCT init"); - goto free_melfb_data; - } - - state->lifter.num_ceps = config->num_ceps; - state->lifter.cepstral_lifter = config->cepstral_lifter; /* Q7.9 max 64.0*/ - ret = mfcc_get_cepstral_lifter(mod, &state->lifter); - if (ret < 0) { - comp_err(dev, "Failed cepstral lifter"); - goto free_dct_matrix; + /* Setup DCT and cepstral lifter only when num_ceps > 0. + * When num_ceps is zero, skip DCT/lifter and output Mel + * log spectra directly. 
+ */ + if (config->num_ceps > 0) { + dct->num_in = config->num_mel_bins; + dct->num_out = config->num_ceps; + dct->type = (enum dct_type)config->dct; + dct->ortho = true; + ret = mod_dct_initialize_16(mod, dct); + if (ret < 0) { + comp_err(dev, "Failed DCT init"); + goto free_melfb_data; + } + + state->lifter.num_ceps = config->num_ceps; + state->lifter.cepstral_lifter = config->cepstral_lifter; /* Q7.9 max 64.0*/ + ret = mfcc_get_cepstral_lifter(mod, &state->lifter); + if (ret < 0) { + comp_err(dev, "Failed cepstral lifter"); + goto free_dct_matrix; + } + + state->mel_only = false; + } else { + comp_info(dev, "num_ceps is 0, Mel log spectra output mode"); + dct->num_in = config->num_mel_bins; + dct->num_out = 0; + dct->matrix = NULL; + state->lifter.matrix = NULL; + state->mel_only = true; } /* Scratch overlay during runtime @@ -288,8 +302,12 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i /* Use FFT buffer as scratch for later computed data */ state->power_spectra = (int32_t *)&fft->fft_buf[0]; state->mel_spectra = (struct mat_matrix_16b *)&fft->fft_out[0]; - state->cepstral_coef = (struct mat_matrix_16b *) - &state->mel_spectra->data[state->dct.num_in]; + if (!state->mel_only) { + state->cepstral_coef = + (struct mat_matrix_16b *)&state->mel_spectra->data[state->dct.num_in]; + } else { + state->cepstral_coef = NULL; + } /* Set initial state for STFT */ state->waiting_fill = true; diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index e42be5b3ca0f..37cf56c2ad98 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -125,6 +125,7 @@ struct mfcc_state { int low_freq; int high_freq; int sample_rate; + bool mel_only; /**< When true, output Mel spectra instead of cepstral coefficients */ bool waiting_fill; /**< booleans */ bool prev_samples_valid; size_t sample_buffers_size; /**< bytes */ From c88c72d39cd0c7cb45b2f02a9339d7e393d5ab56 Mon Sep 17 
00:00:00 2001 From: Seppo Ingalsuo Date: Thu, 30 Apr 2026 19:56:21 +0300 Subject: [PATCH 06/12] Audio: MFCC: Allow output data to span over single period This change allows to have more than e.g. 30 ceps or Mel values plus magic sync value number in a single stereo 16 kHz 16 bit period. As much data can be packed as the FFT hop size and used sink format allows. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_common.c | 137 ++++++++++++++++++------- src/audio/mfcc/mfcc_setup.c | 25 ++++- src/include/sof/audio/mfcc/mfcc_comp.h | 3 + 3 files changed, 128 insertions(+), 37 deletions(-) diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index b80cc0b6b6f3..986b13713199 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -193,38 +193,49 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer struct mfcc_buffer *buf = &cd->state.buf; uint32_t magic = MFCC_MAGIC; int16_t *w_ptr = audio_stream_get_wptr(sink); - // int num_magic = sizeof(magic) / sizeof(int16_t); const int num_magic = 2; int num_ceps; - int zero_samples; + int sink_samples; + int to_copy; /* Get samples from source buffer */ mfcc_source_copy_s16(bsource, buf, &state->emph, frames, state->source_channel); - /* Run STFT and processing after FFT: Mel auditory filter and DCT. The sink - * buffer is updated during STDF processing. - */ + /* Run STFT and processing after FFT: Mel auditory filter and DCT. */ num_ceps = mfcc_stft_process(mod->dev, state); - /* Done, copy data to sink. This works only if the period has room for magic (2) - * plus num_ceps int16_t samples. TODO: split ceps over multiple periods. 
- */ - zero_samples = frames * audio_stream_get_channels(sink); + /* If new output produced, set up pointer into scratch data and mark magic pending */ if (num_ceps > 0) { - int16_t *out_data; + if (state->mel_only) + state->out_data_ptr = state->mel_spectra->data; + else + state->out_data_ptr = state->cepstral_coef->data; - if (state->mel_only) { - out_data = state->mel_spectra->data; - } else { - out_data = state->cepstral_coef->data; - } + state->out_remain = num_ceps; + state->magic_pending = true; + } - zero_samples -= num_ceps + num_magic; + /* Write to sink, limited by period size */ + sink_samples = frames * audio_stream_get_channels(sink); + + /* Write magic word first if pending */ + if (state->magic_pending && sink_samples >= num_magic) { w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_magic, (int16_t *)&magic); - w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_ceps, out_data); + sink_samples -= num_magic; + state->magic_pending = false; + } + + /* Write cepstral/mel data from scratch buffer */ + to_copy = MIN(state->out_remain, sink_samples); + if (to_copy > 0) { + w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, to_copy, state->out_data_ptr); + state->out_data_ptr += to_copy; + state->out_remain -= to_copy; + sink_samples -= to_copy; } - w_ptr = mfcc_sink_copy_zero_s16(sink, w_ptr, zero_samples); + /* Zero-fill remaining sink samples */ + w_ptr = mfcc_sink_copy_zero_s16(sink, w_ptr, sink_samples); } #endif /* CONFIG_FORMAT_S16LE */ @@ -280,8 +291,9 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer int32_t *w_ptr = audio_stream_get_wptr(sink); const int num_magic = 1; /* one int32_t word for magic */ int num_ceps; - int ceps_s32; - int zero_samples; + int sink_samples; + int remain_s32; + int to_copy; /* Get samples from source buffer */ mfcc_source_copy_s24(bsource, buf, &state->emph, frames, state->source_channel); @@ -289,17 +301,43 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer /* 
Run STFT and processing after FFT */ num_ceps = mfcc_stft_process(mod->dev, state); - /* Copy data to sink. Pack int16_t cepstral data into int32_t samples. */ - zero_samples = frames * audio_stream_get_channels(sink); + /* If new output produced, set up pointer into scratch data */ if (num_ceps > 0) { - ceps_s32 = (num_ceps + 1) / 2; - zero_samples -= ceps_s32 + num_magic; + if (state->mel_only) + state->out_data_ptr = state->mel_spectra->data; + else + state->out_data_ptr = state->cepstral_coef->data; + + state->out_remain = num_ceps; + state->magic_pending = true; + } + + /* Write to sink, limited by period size */ + sink_samples = frames * audio_stream_get_channels(sink); + + /* Write magic word first if pending */ + if (state->magic_pending && sink_samples >= num_magic) { w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic); - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, ceps_s32, - (int32_t *)state->cepstral_coef->data); + sink_samples -= num_magic; + state->magic_pending = false; } - w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, zero_samples); + /* Write cepstral/mel data packed as int32_t from scratch buffer */ + remain_s32 = (state->out_remain + 1) / 2; + to_copy = MIN(remain_s32, sink_samples); + if (to_copy > 0) { + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, + (int32_t *)state->out_data_ptr); + state->out_data_ptr += to_copy * 2; + state->out_remain -= to_copy * 2; + if (state->out_remain < 0) + state->out_remain = 0; + + sink_samples -= to_copy; + } + + /* Zero-fill remaining sink samples */ + w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples); } #endif /* CONFIG_FORMAT_S24LE */ @@ -315,8 +353,9 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer int32_t *w_ptr = audio_stream_get_wptr(sink); const int num_magic = 1; /* one int32_t word for magic */ int num_ceps; - int ceps_s32; - int zero_samples; + int sink_samples; + int remain_s32; + int to_copy; /* Get samples from source buffer 
*/ mfcc_source_copy_s32(bsource, buf, &state->emph, frames, state->source_channel); @@ -324,16 +363,42 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer /* Run STFT and processing after FFT */ num_ceps = mfcc_stft_process(mod->dev, state); - /* Copy data to sink. Pack int16_t cepstral data into int32_t samples. */ - zero_samples = frames * audio_stream_get_channels(sink); + /* If new output produced, set up pointer into scratch data */ if (num_ceps > 0) { - ceps_s32 = (num_ceps + 1) / 2; - zero_samples -= ceps_s32 + num_magic; + if (state->mel_only) + state->out_data_ptr = state->mel_spectra->data; + else + state->out_data_ptr = state->cepstral_coef->data; + + state->out_remain = num_ceps; + state->magic_pending = true; + } + + /* Write to sink, limited by period size */ + sink_samples = frames * audio_stream_get_channels(sink); + + /* Write magic word first if pending */ + if (state->magic_pending && sink_samples >= num_magic) { w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic); - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, ceps_s32, - (int32_t *)state->cepstral_coef->data); + sink_samples -= num_magic; + state->magic_pending = false; + } + + /* Write cepstral/mel data packed as int32_t from scratch buffer */ + remain_s32 = (state->out_remain + 1) / 2; + to_copy = MIN(remain_s32, sink_samples); + if (to_copy > 0) { + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, + (int32_t *)state->out_data_ptr); + state->out_data_ptr += to_copy * 2; + state->out_remain -= to_copy * 2; + if (state->out_remain < 0) + state->out_remain = 0; + + sink_samples -= to_copy; } - w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, zero_samples); + /* Zero-fill remaining sink samples */ + w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples); } #endif /* CONFIG_FORMAT_S32LE */ diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 7cf6f26d65ec..8e9203ff2b01 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ 
b/src/audio/mfcc/mfcc_setup.c @@ -53,7 +53,6 @@ static int mfcc_get_window(struct mfcc_state *state, enum sof_mfcc_fft_window_ty case MFCC_POVEY_WINDOW: win_povey_16b(state->window, fft->fft_size); return 0; - default: return -EINVAL; } @@ -309,9 +308,33 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i state->cepstral_coef = NULL; } + /* Allocate output buffer for multi-period output. Size allows for + * current output data plus leftover from previous period. + */ + int max_out_per_hop = state->mel_only ? dct->num_in : dct->num_out; + + /* Check that output data can be drained within the periods spanned by one + * FFT hop. Each hop consumes fft_hop_size input samples and produces + * max_out_per_hop + 2 (magic) int16_t output values. The sink provides at + * least fft_hop_size * channels int16_t samples per hop (worst case s16). + * If output exceeds this, data accumulates and will eventually overflow. + */ + int out_per_hop = max_out_per_hop + 2; + int sink_per_hop = fft->fft_hop_size * channels; + + if (out_per_hop > sink_per_hop) { + comp_err(dev, "Output %d int16 per hop exceeds sink capacity %d (hop %d x ch %d)", + out_per_hop, sink_per_hop, fft->fft_hop_size, channels); + ret = -EINVAL; + goto free_dct_matrix; + } + /* Set initial state for STFT */ state->waiting_fill = true; state->prev_samples_valid = false; + state->magic_pending = false; + state->out_data_ptr = NULL; + state->out_remain = 0; comp_dbg(dev, "done"); return 0; diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index 37cf56c2ad98..c35b9217975b 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -128,7 +128,10 @@ struct mfcc_state { bool mel_only; /**< When true, output Mel spectra instead of cepstral coefficients */ bool waiting_fill; /**< booleans */ bool prev_samples_valid; + bool magic_pending; /**< True when magic word not yet written for current output */ size_t 
sample_buffers_size; /**< bytes */ + int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */ + int out_remain; /**< Remaining int16_t samples to write to sink from scratch */ }; /* MFCC component private data */ From 17dede726088fc490a30fa7ffb0d10f5f48d1305 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Thu, 30 Apr 2026 20:00:03 +0300 Subject: [PATCH 07/12] Audio: MFCC: Update user/mfcc.h comment The description for top_db was wrong. Signed-off-by: Seppo Ingalsuo --- src/include/user/mfcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/user/mfcc.h b/src/include/user/mfcc.h index 7a5b7fcca98e..f6308b468083 100644 --- a/src/include/user/mfcc.h +++ b/src/include/user/mfcc.h @@ -69,7 +69,7 @@ struct sof_mfcc_config { int16_t num_ceps; /**< Number of cepstral coefficients, e.g. 13 */ int16_t num_mel_bins; /**< Number of internal Mel bands, e.g. 23 */ int16_t preemphasis_coefficient; /**< Q1.15, e.g. 0.97, or 0 for disable */ - int16_t top_db; /**< Q8.7 dB, limit Mel energies to this value e.g. 200 */ + int16_t top_db; /**< Q8.7 dB, limit min. Mel energies to chunk max - top_dB, e.g. 80 */ int16_t vtln_high; /**< Reserved, no support */ int16_t vtln_low; /**< Reserved, no support */ int16_t vtln_warp; /**< Reserved, no support */ From 2ee9bed490cd434ce00789adaed8864730e6ff67 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Thu, 30 Apr 2026 20:00:41 +0300 Subject: [PATCH 08/12] Audio: MFCC: Add setup of Hann window The support for Hann window was missing from MFCC setup function.
Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_setup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 8e9203ff2b01..d75c53ff2caf 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -50,6 +50,9 @@ static int mfcc_get_window(struct mfcc_state *state, enum sof_mfcc_fft_window_ty case MFCC_HAMMING_WINDOW: win_hamming_16b(state->window, fft->fft_size); return 0; + case MFCC_HANN_WINDOW: + win_hann_16b(state->window, fft->fft_size); + return 0; case MFCC_POVEY_WINDOW: win_povey_16b(state->window, fft->fft_size); return 0; From 1c5decdfca7b7bda900758788abb4f3d0a05f2c2 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Thu, 30 Apr 2026 19:51:01 +0300 Subject: [PATCH 09/12] Audio: MFCC: Add dynamic mmax tracking and configurable mel scaling For compatibility with OpenVINO Whisper audio features this patch adds to function mfcc_stft_process() peak tracking of Mel spectra maximum in mel_only mode and clamp of Mel spectral values to found maximum minus config->top_db. The parameters for peak tracking and clamping are set via the configuration blob. The whisper audio features like absolute max behavior can be achieved with mmax_coef set to zero. Then the mmax value rises to the detected peak and remains there. The patch also adds normalization of Mel values with a configurable offset and scale. Whisper uses hard-coded values but making them configuration parameters from the blob is more flexible. The input parameter state is changed to struct mfcc_comp_data *cd to be able to access both state and configuration for the module. The ABI header user/mfcc.h is modified in a way that previous default operation for cepstral coefficients is not impacted. The new Mel only mode uses fields added in the previously reserved area of the configuration blob.
Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_common.c | 58 ++++++++++++++++++++++---- src/audio/mfcc/mfcc_setup.c | 1 + src/include/sof/audio/mfcc/mfcc_comp.h | 1 + src/include/user/mfcc.h | 8 +++- 4 files changed, 59 insertions(+), 9 deletions(-) diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index 986b13713199..bba1253f9740 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -36,8 +37,10 @@ LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL); * The main processing function for MFCC */ -static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_state *state) +static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *cd) { + struct sof_mfcc_config *config = cd->config; + struct mfcc_state *state = &cd->state; struct mfcc_buffer *buf = &state->buf; struct mfcc_fft *fft = &state->fft; int mel_scale_shift; @@ -45,6 +48,10 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_state *stat int i; int m; int cc_count = 0; + int32_t s; + int16_t mel_value; + int16_t peak; + int16_t clamp_value; /* Phase 1, wait until whole fft_size is filled with valid data. This way * first output cepstral coefficients originate from streamed data and not @@ -122,6 +129,43 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_state *stat if (state->mel_only) { /* In Mel-only mode output Mel log spectra directly */ cc_count += state->dct.num_in; + + /* Find peak mel value and track state->mmax */ + if (config->dynamic_mmax) { + peak = state->mel_spectra->data[0]; + for (i = 1; i < state->dct.num_in; i++) { + if (state->mel_spectra->data[i] > peak) + peak = state->mel_spectra->data[i]; + } + + /* Jump to peak immediately if higher, decay otherwise */ + if (peak > state->mmax) { + state->mmax = peak; + } else { + /* Q8.7 * Q1.15, result Q8.7. 
The coefficient is small so + * no need for saturation. + */ + s = (int32_t)peak - state->mmax; + state->mmax += + Q_MULTSR_32X32(s, config->mmax_coef, 7, 15, 7); + } + } + + /* Clamp Mel values lower than mmax - top_db, add offset, and scale */ + clamp_value = state->mmax - config->top_db; + for (i = 0; i < state->dct.num_in; i++) { + mel_value = state->mel_spectra->data[i]; + if (mel_value < clamp_value) + mel_value = clamp_value; + + /* Q8.7 * Q4.12, result 8.7 */ + s = (int32_t)mel_value + config->mel_offset; + state->mel_spectra->data[i] = + sat_int16(Q_MULTSR_32X32(s, config->mel_scale, 7, 12, 7)); + } + + /* Enable this to check mmax decay */ + comp_dbg(dev, "state->mmax = %d", state->mmax); } else { /* Multiply Mel spectra with DCT matrix to get cepstral coefficients */ mat_init_16b(state->cepstral_coef, 1, state->dct.num_out, 7); /* Q8.7 */ @@ -146,8 +190,8 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_state *stat } #if CONFIG_FORMAT_S16LE -static int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, - int16_t *w_ptr, int samples) +static int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, int16_t *w_ptr, + int samples) { int copied; int nmax; @@ -165,7 +209,7 @@ static int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, } static int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr, - int samples, int16_t *r_ptr) + int samples, int16_t *r_ptr) { int copied; int nmax; @@ -202,7 +246,7 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer mfcc_source_copy_s16(bsource, buf, &state->emph, frames, state->source_channel); /* Run STFT and processing after FFT: Mel auditory filter and DCT. 
*/ - num_ceps = mfcc_stft_process(mod->dev, state); + num_ceps = mfcc_stft_process(mod->dev, cd); /* If new output produced, set up pointer into scratch data and mark magic pending */ if (num_ceps > 0) { @@ -299,7 +343,7 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer mfcc_source_copy_s24(bsource, buf, &state->emph, frames, state->source_channel); /* Run STFT and processing after FFT */ - num_ceps = mfcc_stft_process(mod->dev, state); + num_ceps = mfcc_stft_process(mod->dev, cd); /* If new output produced, set up pointer into scratch data */ if (num_ceps > 0) { @@ -361,7 +405,7 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer mfcc_source_copy_s32(bsource, buf, &state->emph, frames, state->source_channel); /* Run STFT and processing after FFT */ - num_ceps = mfcc_stft_process(mod->dev, state); + num_ceps = mfcc_stft_process(mod->dev, cd); /* If new output produced, set up pointer into scratch data */ if (num_ceps > 0) { diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index d75c53ff2caf..0a9fc19f0f53 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -152,6 +152,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i else state->source_channel = config->channel; + state->mmax = config->mmax_init; state->emph.enable = config->preemphasis_coefficient > 0; state->emph.coef = -config->preemphasis_coefficient; /* Negate config parameter */ fft->fft_size = config->frame_length; diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index c35b9217975b..accf45868cbd 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -114,6 +114,7 @@ struct mfcc_state { struct mat_matrix_16b *mel_spectra; /**< Pointer to scratch */ struct mat_matrix_16b *cepstral_coef; /**< Pointer to scratch */ int32_t *power_spectra; /**< Pointer to scratch */ + int16_t mmax; 
/**< Maximum Mel value in Q9.7 */ int16_t buf_avail; int16_t *buffers; int16_t *prev_data; /**< prev_data_size */ diff --git a/src/include/user/mfcc.h b/src/include/user/mfcc.h index f6308b468083..8a0defcd9883 100644 --- a/src/include/user/mfcc.h +++ b/src/include/user/mfcc.h @@ -50,7 +50,11 @@ enum sof_mfcc_dct_type { */ struct sof_mfcc_config { uint32_t size; /**< Size of this struct in bytes */ - uint32_t reserved[8]; + int16_t mel_offset; /**< Q8.7 default 0, use 4.0 for Whisper */ + int16_t mel_scale; /**< Q4.12 default 1.0, use 0.25 for Whisper */ + int16_t mmax_init; /**< Q8.7 default 0, with dynamic_mmax false, can sim. Whisper mmax */ + int16_t mmax_coef; /**< Q1.15 decay coefficient for dynamic mmax, a small value for slow */ + uint32_t reserved[6]; int32_t sample_frequency; /**< Hz. e.g. 16000 */ int32_t pmin; /**< Q1.31 linear power, limit minimum Mel energy, e.g. 1e-9 */ enum sof_mfcc_mel_log_type mel_log; /**< Use MEL_LOG_IS_LOG, LOG10 or DB*/ @@ -80,7 +84,7 @@ struct sof_mfcc_config { bool snip_edges; /**< Must be true (1) */ bool subtract_mean; /**< Must be false (0) */ bool use_energy; /**< Must be false (0) */ - bool reserved_bool1; + bool dynamic_mmax; /**< Track max Mel value for clamp with top_db value */ bool reserved_bool2; bool reserved_bool3; } __attribute__((packed)); From 7119ea4e475c6e13db8dc7cf896206085a4c1a30 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Wed, 29 Apr 2026 15:13:47 +0300 Subject: [PATCH 10/12] Audio: MFCC: Update setup_mfcc.m blob export script There are several changes: - The topology v1 format blob export is removed. It updates the MFCC module blob default.conf and adds a new blob mel80.conf for topology v2. - The script is organized to be able to output multiple blobs. - The topology sof-hda-benchmark-mfcc16/24/32.tplg is using stereo data format, so the blob configuration -1 for channels to assume mono is wrong in setup_mfcc.m. - A blob for Mel frequency scale logarithmic spectrum output is added.
It sets num_ceps to zero to indicate Mel mode for MFCC. The parameters are set for Whisper compatible audio features with 80 Mel bins, Hann -window, FFT size 400 (padded to 512) with hop of 160. - The missing export of mel_log (log/log10/db) and norm parameters (none/slaney) is added. - Parameters are added for compability with OpenVINO's Whisper audio features extractor. The Mel values are clamped vs. tracked Mel values maximum and existing top_db parameter and normalized with a configurable offset and scale. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/tune/setup_mfcc.m | 144 +++++++++++++----- .../include/components/mfcc/default.conf | 8 +- .../include/components/mfcc/mel80.conf | 22 +++ 3 files changed, 135 insertions(+), 39 deletions(-) create mode 100644 tools/topology/topology2/include/components/mfcc/mel80.conf diff --git a/src/audio/mfcc/tune/setup_mfcc.m b/src/audio/mfcc/tune/setup_mfcc.m index e0d42e1e034d..bd2b3f11e60b 100644 --- a/src/audio/mfcc/tune/setup_mfcc.m +++ b/src/audio/mfcc/tune/setup_mfcc.m @@ -1,23 +1,36 @@ -% setup_mfcc(cfg) +% setup_mfcc() % -% Input -% cfg - optional MFCC configuration parameters struct, see -% below from code -% -% Create binary configuration blob for MFCC component. The hex data -% is written to tools/topology/topology2/include/components/mfcc and -% tools/topology/topology1/m4/mfcc. +% Create binary configuration blobs for the MFCC component. +% The hex data is written to files in directory +% tools/topology/topology2/include/components/mfcc. % SPDX-License-Identifier: BSD-3-Clause % -% Copyright (c) 2018-2026, Intel Corporation. All rights reserved. +% Copyright (c) 2018-2026, Intel Corporation. 
+ +function setup_mfcc() + + gen_cfg.tplg_ver = 2; + gen_cfg.ipc_ver = 4; + gen_cfg.tools_path = '../../../../tools/'; + gen_cfg.mfcc_conf_path = [gen_cfg.tools_path 'topology/topology2/include/components/mfcc/']; + + % Default blob + setup = get_mfcc_default_config(); + setup.tplg_fn = 'default.conf'; + export_mfcc_setup(gen_cfg, setup); -function setup_mfcc(cfg) + % Blob for mel spectrogram data + setup = get_mel_spectrogram_config(); + setup.tplg_fn = 'mel80.conf'; + export_mfcc_setup(gen_cfg, setup); -if nargin < 1 +end + +function cfg = get_mfcc_default_config() cfg.blackman_coef = 0.42; cfg.cepstral_lifter = 22.0; - cfg.channel = -1; % -1 expect mono, 0 left, 1 right ... + cfg.channel = 0; % -1 expect mono, 0 left, 1 right ... cfg.dither = 0.0; % no support cfg.energy_floor = 1.0; cfg.frame_length = 25.0; % ms @@ -44,26 +57,54 @@ function setup_mfcc(cfg) cfg.mel_log = 'log'; % Set to 'db' for librosa, set to 'log10' for matlab cfg.pmin = 5e-10; % Set to 1e-10 for librosa cfg.top_db = 200; % Set to 80 for librosa + cfg.mel_offset = 0; % For mel_only mode, no impact with num_ceps > 0 + cfg.mel_scale = 0; % same + cfg.mmax_init = 0; % same + cfg.mmax_coef = 0; % same + cfg.dynamic_mmax = false; % same end -cfg.tools = '../../../../tools/'; - -cfg.tplg_fn = [cfg.tools 'topology/topology1/m4/mfcc/mfcc_config.m4']; -cfg.tplg_ver = 1; -cfg.ipc_ver = 3; -export_mfcc_setup(cfg); - -cfg.tplg_fn = [cfg.tools 'topology/topology2/include/components/mfcc/default.conf']; -cfg.tplg_ver = 2; -cfg.ipc_ver = 4; -export_mfcc_setup(cfg); - +function cfg = get_mel_spectrogram_config() + cfg.blackman_coef = 0; + cfg.cepstral_lifter = 0; + cfg.channel = 0; + cfg.dither = 0; + cfg.energy_floor = 1.0; + cfg.frame_length = 25.0; % 400 samples at 16 kHz + cfg.frame_shift = 10.0; % 160 samples at 16 kHz + cfg.high_freq = 8000; + cfg.htk_compat = false; + cfg.low_freq = 0; + cfg.num_ceps = 0; % Mel-only mode, no DCT + cfg.min_duration = 0; + cfg.norm = 'slaney'; + cfg.num_mel_bins = 80; + 
cfg.preemphasis_coefficient = 0; + cfg.raw_energy = false; + cfg.remove_dc_offset = false; + cfg.round_to_power_of_two = true; + cfg.sample_frequency = 16000; + cfg.snip_edges = true; + cfg.subtract_mean = false; + cfg.use_energy = false; + cfg.vtln_high = 0; + cfg.vtln_low = 0; + cfg.vtln_warp = 1.0; + cfg.window_type = 'hann'; + cfg.mel_log = 'log10'; + cfg.pmin = 1e-10; + cfg.top_db = 8; % applied for log10, would be 80 dB clamp for decibels as 10*log10() + cfg.mel_offset = 4.0; % For whisper like Mel scale and normalize + cfg.mel_scale = 0.25; % For whisper like Mel scale and normalize + cfg.mmax_init = 0; % Initial value max Mel value, data clamp is mmax - top_db + cfg.mmax_coef = 0; % Dynamic max Mel value decay coefficient (zero lock to found max) + cfg.dynamic_mmax = true; end -function export_mfcc_setup(cfg) +function export_mfcc_setup(gen_cfg, cfg) %% Use blob tool from EQ -addpath([cfg.tools 'tune/common']); +addpath([gen_cfg.tools_path 'tune/common']); %% Blob size, size plus reserved(8) + current parameters nbytes_data = 104; @@ -73,7 +114,7 @@ function export_mfcc_setup(cfg) sh16 = [0 -8]; %% Get ABI information -[abi_bytes, nbytes_abi] = sof_get_abi(nbytes_data, cfg.ipc_ver); +[abi_bytes, nbytes_abi] = sof_get_abi(nbytes_data, gen_cfg.ipc_ver); %% Initialize correct size uint8 array nbytes = nbytes_abi + nbytes_data; @@ -86,14 +127,21 @@ function export_mfcc_setup(cfg) %% Apply default MFCC configuration, first struct header and reserved, then data [b8, j] = add_w32b(nbytes_data, b8, j); -for i = 1:8 + +v = q_convert(cfg.mel_offset, 7); [b8, j] = add_w16b(v, b8, j); +v = q_convert(cfg.mel_scale, 12); [b8, j] = add_w16b(v, b8, j); +v = q_convert(cfg.mmax_init, 7); [b8, j] = add_w16b(v, b8, j); +v = q_convert(cfg.mmax_coef, 15); [b8, j] = add_w16b(v, b8, j); + +% Reserved +for i = 1:6 [b8, j] = add_w32b(0, b8, j); end v = q_convert(cfg.sample_frequency, 0); [b8, j] = add_w32b(v, b8, j); v = q_convert(cfg.pmin, 31); [b8, j] = add_w32b(v, b8, j); -v = 0; 
[b8, j] = add_w32b(v, b8, j); % enum mel_log -v = 0; [b8, j] = add_w32b(v, b8, j); % enum norm +v = get_mel_log_value(cfg.mel_log); [b8, j] = add_w32b(v, b8, j); % enum mel_log +v = get_norm_value(cfg.norm); [b8, j] = add_w32b(v, b8, j); % enum norm v = 0; [b8, j] = add_w32b(v, b8, j); % enum pad v = get_window(cfg); [b8, j] = add_w32b(v, b8, j); % enum window v = 1; [b8, j] = add_w32b(v, b8, j); % enum dct type @@ -119,22 +167,24 @@ function export_mfcc_setup(cfg) v = cfg.snip_edges; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.subtract_mean; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.use_energy; [b8, j] = add_w8b(v, b8, j); % bool +v = cfg.dynamic_mmax; [b8, j] = add_w8b(v, b8, j); % bool %% Export -switch cfg.tplg_ver +tplg_fn = [gen_cfg.mfcc_conf_path cfg.tplg_fn]; +switch gen_cfg.tplg_ver case 1 - sof_tplg_write(cfg.tplg_fn, b8, "DEF_MFCC_PRIV", ... + sof_tplg_write(tplg_fn, b8, "DEF_MFCC_PRIV", ... "Exported with script setup_mfcc.m", ... "cd src/audio/mfcc/tune; octave setup_mfcc.m"); case 2 - sof_tplg2_write(cfg.tplg_fn, b8, "mfcc_config", ... + sof_tplg2_write(tplg_fn, b8, "mfcc_config", ... "Exported MFCC configuration", ... 
"cd src/audio/mfcc/tune; octave setup_mfcc.m"); otherwise - error("Illegal cfg.tplg_ver, use 1 for topology v1 or 2 topology v2."); + error("Illegal tplg_ver, use 1 for topology v1 or 2 topology v2."); end -rmpath([cfg.tools 'tune/common']); +rmpath([gen_cfg.tools_path 'tune/common']); end @@ -157,6 +207,30 @@ function export_mfcc_setup(cfg) end end +function n = get_mel_log_value(mel_log) + switch lower(mel_log) + case 'log' + n = 0; + case 'log10' + n = 1; + case 'db' + n = 2; + otherwise + error('Unknown mel_log type'); + end +end + +function n = get_norm_value(norm) + switch lower(norm) + case 'none' + n = 0; + case 'slaney' + n = 1; + otherwise + error('Unknown norm type'); + end +end + function bytes = w8b(word) bytes = uint8(zeros(1,1)); bytes(1) = bitand(word, 255); diff --git a/tools/topology/topology2/include/components/mfcc/default.conf b/tools/topology/topology2/include/components/mfcc/default.conf index 1f9141886de9..42a6d6608b8b 100644 --- a/tools/topology/topology2/include/components/mfcc/default.conf +++ b/tools/topology/topology2/include/components/mfcc/default.conf @@ -1,9 +1,9 @@ -# Exported MFCC configuration 24-Jul-2024 -# cd tools/tune/mfcc; octave setup_mfcc.m +# Exported MFCC configuration 05-May-2026 +# cd src/audio/mfcc/tune; octave setup_mfcc.m Object.Base.data."mfcc_config" { bytes " 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, - 0x68,0x00,0x00,0x00,0x00,0xa0,0x01,0x03, + 0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x68,0x00,0x00,0x00,0x00,0x00,0x00,0x00, @@ -14,7 +14,7 @@ Object.Base.data."mfcc_config" { 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x02,0x00,0x00,0x00,0x01,0x00,0x00,0x00, - 0xc3,0x35,0x00,0x2c,0xff,0xff,0x00,0x00, + 0xc3,0x35,0x00,0x2c,0x00,0x00,0x00,0x00, 0x90,0x01,0xa0,0x00,0x00,0x00,0x14,0x00, 0x0d,0x00,0x17,0x00,0x00,0x00,0x00,0x64, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, diff --git 
a/tools/topology/topology2/include/components/mfcc/mel80.conf b/tools/topology/topology2/include/components/mfcc/mel80.conf new file mode 100644 index 000000000000..04aa2a15c660 --- /dev/null +++ b/tools/topology/topology2/include/components/mfcc/mel80.conf @@ -0,0 +1,22 @@ +# Exported MFCC configuration 05-May-2026 +# cd src/audio/mfcc/tune; octave setup_mfcc.m +Object.Base.data."mfcc_config" { + bytes " + 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, + 0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x68,0x00,0x00,0x00,0x00,0x02,0x00,0x04, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x80,0x3e,0x00,0x00, + 0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00, + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x03,0x00,0x00,0x00,0x01,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00, + 0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x01,0x01,0x00,0x00,0x01,0x00,0x00" +} From 020de6f086e6ad68a36cc70b0008e2557244b9a1 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Mon, 4 May 2026 15:35:34 +0300 Subject: [PATCH 11/12] Tools: Topology: Build test topologies for Mel output mode of MFCC This patch adds build of test topologies to test OpenVINO Whisper audio features extractor compatible setup for SOF MFCC. The topology names are sof-hda-benchmark-mfccmel16/24/32.tplg. The MFCC module is initialized to produce spectrogram data for 80 Mel frequency bands. 
Signed-off-by: Seppo Ingalsuo --- tools/topology/topology2/cavs-benchmark-hda.conf | 10 ++++++++++ .../topology2/development/tplg-targets-bench.cmake | 2 ++ .../include/bench/mfcc_controls_capture.conf | 1 + .../include/bench/mfcc_controls_playback.conf | 1 + .../topology2/include/bench/mfccmel_s16.conf | 13 +++++++++++++ .../topology2/include/bench/mfccmel_s24.conf | 13 +++++++++++++ .../topology2/include/bench/mfccmel_s32.conf | 13 +++++++++++++ 7 files changed, 53 insertions(+) create mode 100644 tools/topology/topology2/include/bench/mfccmel_s16.conf create mode 100644 tools/topology/topology2/include/bench/mfccmel_s24.conf create mode 100644 tools/topology/topology2/include/bench/mfccmel_s32.conf diff --git a/tools/topology/topology2/cavs-benchmark-hda.conf b/tools/topology/topology2/cavs-benchmark-hda.conf index 62c0ad4f4fbc..95ab67431812 100644 --- a/tools/topology/topology2/cavs-benchmark-hda.conf +++ b/tools/topology/topology2/cavs-benchmark-hda.conf @@ -834,6 +834,16 @@ IncludeByKey.BENCH_CONFIG { } + "mfccmel16" { + + } + "mfccmel24" { + + } + "mfccmel32" { + + } + # # Micsel component # diff --git a/tools/topology/topology2/development/tplg-targets-bench.cmake b/tools/topology/topology2/development/tplg-targets-bench.cmake index eff707d49aa9..5c0f82dc7dfc 100644 --- a/tools/topology/topology2/development/tplg-targets-bench.cmake +++ b/tools/topology/topology2/development/tplg-targets-bench.cmake @@ -19,6 +19,7 @@ set(components "igo_nr" "level_multiplier" "mfcc" + "mfccmel" "micsel" "rtnr" "sound_dose" @@ -45,6 +46,7 @@ set(component_parameters "BENCH_IGO_NR_PARAMS=default" "BENCH_LEVEL_MULTIPLIER_PARAMS=default" "BENCH_MFCC_PARAMS=default" + "BENCH_MFCC_PARAMS=mel80" "BENCH_MICSEL_PARAMS=passthrough" "BENCH_RTNR_PARAMS=default" "BENCH_SOUND_DOSE_PARAMS=default" diff --git a/tools/topology/topology2/include/bench/mfcc_controls_capture.conf b/tools/topology/topology2/include/bench/mfcc_controls_capture.conf index 56a731b86687..d45baec1ee8f 100644 --- 
a/tools/topology/topology2/include/bench/mfcc_controls_capture.conf +++ b/tools/topology/topology2/include/bench/mfcc_controls_capture.conf @@ -6,6 +6,7 @@ name '$ANALOG_CAPTURE_PCM MFCC bytes' IncludeByKey.BENCH_MFCC_PARAMS { "default" "include/components/mfcc/default.conf" + "mel80" "include/components/mfcc/mel80.conf" } } #mixer."1" { diff --git a/tools/topology/topology2/include/bench/mfcc_controls_playback.conf b/tools/topology/topology2/include/bench/mfcc_controls_playback.conf index 7649678c8468..cc2ada04b8d7 100644 --- a/tools/topology/topology2/include/bench/mfcc_controls_playback.conf +++ b/tools/topology/topology2/include/bench/mfcc_controls_playback.conf @@ -6,6 +6,7 @@ name '$ANALOG_PLAYBACK_PCM MFCC bytes' IncludeByKey.BENCH_MFCC_PARAMS { "default" "include/components/mfcc/default.conf" + "mel80" "include/components/mfcc/mel80.conf" } } #mixer."1" { diff --git a/tools/topology/topology2/include/bench/mfccmel_s16.conf b/tools/topology/topology2/include/bench/mfccmel_s16.conf new file mode 100644 index 000000000000..ec89bffb90a1 --- /dev/null +++ b/tools/topology/topology2/include/bench/mfccmel_s16.conf @@ -0,0 +1,13 @@ + # Created with script "./bench_comp_generate.sh mfcc" + Object.Widget.mfcc.1 { + index $BENCH_PLAYBACK_HOST_PIPELINE + + + } + Object.Widget.mfcc.2 { + index $BENCH_CAPTURE_HOST_PIPELINE + + + } + + diff --git a/tools/topology/topology2/include/bench/mfccmel_s24.conf b/tools/topology/topology2/include/bench/mfccmel_s24.conf new file mode 100644 index 000000000000..73571fabe5f2 --- /dev/null +++ b/tools/topology/topology2/include/bench/mfccmel_s24.conf @@ -0,0 +1,13 @@ + # Created with script "./bench_comp_generate.sh mfcc" + Object.Widget.mfcc.1 { + index $BENCH_PLAYBACK_HOST_PIPELINE + + + } + Object.Widget.mfcc.2 { + index $BENCH_CAPTURE_HOST_PIPELINE + + + } + + diff --git a/tools/topology/topology2/include/bench/mfccmel_s32.conf b/tools/topology/topology2/include/bench/mfccmel_s32.conf new file mode 100644 index 
000000000000..75c01eaf4a43 --- /dev/null +++ b/tools/topology/topology2/include/bench/mfccmel_s32.conf @@ -0,0 +1,13 @@ + # Created with script "./bench_comp_generate.sh mfcc" + Object.Widget.mfcc.1 { + index $BENCH_PLAYBACK_HOST_PIPELINE + + + } + Object.Widget.mfcc.2 { + index $BENCH_CAPTURE_HOST_PIPELINE + + + } + + From b39cc4e3102fd9b69b4355ab4aacc16802493934 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 28 Apr 2026 14:20:03 +0300 Subject: [PATCH 12/12] Audio: MFCC: Update example run script run_mfcc.sh This patch contains several updates: - A run with valgrind is added to catch memory leaks. - The script applied duplicate "-i" and "-o" arguments. They are removed from the "OPT" variables. - The sof-testbench4 can't override the channel count of the topology the way the IPC3 testbench could. Since the current topology is for stereo 16 kHz, the input data and the command line must match it. - To be able to compare the MFCC output of successive runs, the "-R" option is added to the sox audio conversion commands to prevent e.g. randomization of dither. - The script converts the input to s24 and s32 formats and runs them to make it easier to check correct operation with the supported formats. The conversion is done from the s16 version so that the output audio features can be compared; they should be the same if the internal processing is 16 bit. - A run with Mel configured MFCC is added for s16/24/32 formats. - A script to decode and visualize Mel spectrogram data is added as decode_mel.m. 
Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/tune/README.txt | 8 ++- src/audio/mfcc/tune/decode_mel.m | 101 +++++++++++++++++++++++++++++++ src/audio/mfcc/tune/run_mfcc.sh | 54 ++++++++++++++--- 3 files changed, 152 insertions(+), 11 deletions(-) create mode 100644 src/audio/mfcc/tune/decode_mel.m diff --git a/src/audio/mfcc/tune/README.txt b/src/audio/mfcc/tune/README.txt index fb8208992ed4..7ea6618896b9 100644 --- a/src/audio/mfcc/tune/README.txt +++ b/src/audio/mfcc/tune/README.txt @@ -16,7 +16,7 @@ The output file is hard-coded to mfcc.raw. The output can be plotted and retrieved with Matlab or Octave command: -[ceps, t, n] = decode_ceps('mfcc.raw', 13); +[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13); In the above it's known from configuration script that MFCC was set up to output 13 cepstral coefficients from each FFT -> Mel -> DCT -> Cepstral @@ -27,3 +27,9 @@ e.g. other sound files found in computer. ./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/bark.ogg ./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/sonar.ogg + +The script runs the same input sample with s16/24/32 formats for +cepstral coefficients data output and Mel frequency spectrogram +output. The 80-band Mel output can be visualized with the command: + +[mel, t, n] = decode_mel('mel_s16.raw', 80); diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m new file mode 100644 index 000000000000..c52ad4b9f6d9 --- /dev/null +++ b/src/audio/mfcc/tune/decode_mel.m @@ -0,0 +1,101 @@ +% [mel, t, n] = decode_mel(fn, num_mel, num_channels) +% +% Input +% fn - File with MFCC data in .raw or .wav format +% num_mel - number of Mel coefficients per frame +% num_channels - needed for .raw format, omit for .wav +% +% Outputs +% mel - Mel coefficients +% t - time vector for plotting +% n - mel 1..num_mel vector for plotting + +% SPDX-License-Identifier: BSD-3-Clause +% Copyright(c) 2026 Intel Corporation. 
+
+function [mel, t, n] = decode_mel(fn, num_mel, num_channels)
+
+if nargin < 3
+	num_channels = 1;
+end
+
+% MFCC stream
+fs = 16e3;
+qformat = 7;
+magic = [25443 28006]; % ASCII 'mfcc' as int16
+
+% Load output data
+[data, num_channels] = get_file(fn, num_channels);
+
+% A frame starts where the two magic words appear back to back. The
+% last sample is left out of the search to keep data(idx1 + 1) in range.
+idx1 = find(data(1:end - 1) == magic(1));
+idx = idx1(data(idx1 + 1) == magic(2))';
+
+if isempty(idx)
+	error('No magic value markers found from stream');
+end
+
+% With a single frame there is no period to measure, use zero
+period_mel = idx(min(2, end)) - idx(1);
+num_frames = length(idx);
+
+% Last frame can be incomplete due to span over multiple periods.
+% The payload follows the two magic words: idx + 2 ... idx + num_mel + 1.
+last = idx(end) + num_mel + 1;
+if (last > length(data))
+	num_frames = num_frames - 1;
+end
+
+t_mel = period_mel / num_channels / fs;
+t = (0:num_frames -1) * t_mel;
+n = 1:num_mel;
+
+mel = zeros(num_mel, num_frames);
+for i = 1:num_frames
+	i1 = idx(i) + 2;
+	i2 = i1 + num_mel - 1;
+	mel(:,i) = data(i1:i2) / 2^qformat;
+end
+
+figure;
+imagesc(t, n, mel);
+axis xy;
+colormap(jet);
+colorbar;
+tstr = sprintf('SOF MFCC Mel coefficients (%s)', fn);
+title(tstr, 'Interpreter', 'None');
+xlabel('Time (s)');
+ylabel('Mel coef #');
+
+end
+
+% Read the stream as a column of samples in double format
+function [data, num_channels] = get_file(fn, num_channels)
+
+[~, ~, ext] = fileparts(fn);
+
+switch lower(ext)
+	case '.raw'
+		fh = fopen(fn, 'r');
+		if fh < 0
+			error('Could not open file %s', fn);
+		end
+		data = fread(fh, 'int16');
+		fclose(fh);
+	case '.wav'
+		tmp = audioread(fn, 'native');
+		if ~isa(tmp, 'int16')
+			error('Only 16-bit wav file format is supported');
+		end
+		num_channels = size(tmp, 2);
+		% Interleave also mono as double, int16 would round in the scaling
+		data = zeros(numel(tmp), 1);
+		for i = 1:num_channels
+			data(i:num_channels:end) = double(tmp(:, i));
+		end
+	otherwise
+		error('Unknown audio format');
+end
+
+end
diff --git a/src/audio/mfcc/tune/run_mfcc.sh b/src/audio/mfcc/tune/run_mfcc.sh index d531e4519755..a1b8030a6063 100755 --- a/src/audio/mfcc/tune/run_mfcc.sh +++ b/src/audio/mfcc/tune/run_mfcc.sh @@ -4,19 +4,53 @@ set -e -RAW_INPUT=in.raw -RAW_OUTPUT=mfcc.raw 
+RAW_INPUT_S16=in_s16.raw +RAW_INPUT_S24=in_s24.raw +RAW_INPUT_S32=in_s32.raw +RAW_OUTPUT_S16=mfcc_s16.raw +RAW_OUTPUT_S24=mfcc_s24.raw +RAW_OUTPUT_S32=mfcc_s32.raw +VALGRIND="valgrind --leak-check=full" TESTBENCH=$SOF_WORKSPACE/sof/tools/testbench/build_testbench/install/bin/sof-testbench4 -TOPOLOGY=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfcc16.tplg -OPT="-r 16000 -c 2 -b S16_LE -p 3,4 -t $TOPOLOGY -i $RAW_INPUT -o $RAW_OUTPUT" +TOPOLOGY_S16=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfcc16.tplg +TOPOLOGY_S24=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfcc24.tplg +TOPOLOGY_S32=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfcc32.tplg +OPT_S16="-r 16000 -c 2 -b S16_LE -p 3,4 -t $TOPOLOGY_S16" +OPT_S24="-r 16000 -c 2 -b S24_LE -p 3,4 -t $TOPOLOGY_S24" +OPT_S32="-r 16000 -c 2 -b S32_LE -p 3,4 -t $TOPOLOGY_S32" -# Convert input audio file raw 16 kHz 1 channel 16 bit -sox --encoding signed-integer "$1" -L -r 16000 -c 1 -b 16 "$RAW_INPUT" +# Convert input audio file raw 16 kHz 2 channel 16 bit +sox -R --encoding signed-integer "$1" -L -r 16000 -c 2 -b 16 "$RAW_INPUT_S16" +sox -R --no-dither --encoding signed-integer -L -r 16000 -c 2 -b 16 "$RAW_INPUT_S16" -b 32 "$RAW_INPUT_S32" +sox -R --no-dither --encoding signed-integer -L -r 16000 -c 2 -b 16 "$RAW_INPUT_S16" -b 32 "$RAW_INPUT_S24" vol 0.003906250000 # Run testbench -$TESTBENCH $OPT -i "$RAW_INPUT" -o "$RAW_OUTPUT" +$VALGRIND $TESTBENCH $OPT_S16 -i "$RAW_INPUT_S16" -o "$RAW_OUTPUT_S16" +$VALGRIND $TESTBENCH $OPT_S24 -i "$RAW_INPUT_S24" -o "$RAW_OUTPUT_S24" +$VALGRIND $TESTBENCH $OPT_S32 -i "$RAW_INPUT_S32" -o "$RAW_OUTPUT_S32" -echo ----------------------------------------------- -echo The MFCC data was output to file $RAW_OUTPUT -echo ----------------------------------------------- +echo 
---------------------------------------------------------------------------------- +echo The MFCC data was output to file $RAW_OUTPUT_S16, $RAW_OUTPUT_S24, $RAW_OUTPUT_S32 +echo ---------------------------------------------------------------------------------- + +RAW_OUTPUT_S16=mel_s16.raw +RAW_OUTPUT_S24=mel_s24.raw +RAW_OUTPUT_S32=mel_s32.raw + +TESTBENCH=$SOF_WORKSPACE/sof/tools/testbench/build_testbench/install/bin/sof-testbench4 +TOPOLOGY_S16=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfccmel16.tplg +TOPOLOGY_S24=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfccmel24.tplg +TOPOLOGY_S32=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfccmel32.tplg +OPT_S16="-r 16000 -c 2 -b S16_LE -p 3,4 -t $TOPOLOGY_S16" +OPT_S24="-r 16000 -c 2 -b S24_LE -p 3,4 -t $TOPOLOGY_S24" +OPT_S32="-r 16000 -c 2 -b S32_LE -p 3,4 -t $TOPOLOGY_S32" + +# Run testbench +$VALGRIND $TESTBENCH $OPT_S16 -i "$RAW_INPUT_S16" -o "$RAW_OUTPUT_S16" +$VALGRIND $TESTBENCH $OPT_S24 -i "$RAW_INPUT_S24" -o "$RAW_OUTPUT_S24" +$VALGRIND $TESTBENCH $OPT_S32 -i "$RAW_INPUT_S32" -o "$RAW_OUTPUT_S32" + +echo ---------------------------------------------------------------------------------- +echo The MFCC Mel data was output to file $RAW_OUTPUT_S16, $RAW_OUTPUT_S24, $RAW_OUTPUT_S32 +echo ----------------------------------------------------------------------------------