arm_biquad_cascade_df2T_f32.c - OpenGrok cross reference for /cmsis-dsp-latest/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c

/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_biquad_cascade_df2T_f32.c
 * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/filtering_functions.h"

/**
  @ingroup groupFilters
*/

/**
  @addtogroup BiquadCascadeDF2T
  @{
 */

/**
  @brief         Processing function for the floating-point transposed direct form II Biquad cascade filter.
  @param[in]     S         points to an instance of the filter data structure
  @param[in]     pSrc      points to the block of input data
  @param[out]    pDst      points to the block of output data
  @param[in]     blockSize number of samples to process
 */
#if (defined(ARM_MATH_MVEF) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"

ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f32(
  const arm_biquad_cascade_df2T_instance_f32 * S,
  const float32_t * pSrc,
        float32_t * pDst,
        uint32_t blockSize)
{
    const float32_t *pIn = pSrc;                  /*  source pointer            */
    float32_t Xn0, Xn1;
    float32_t acc0, acc1;
    float32_t *pOut = pDst;                 /*  destination pointer       */
    float32_t *pState = S->pState;          /*  State pointer             */
    uint32_t  sample, stage = S->numStages; /*  loop counters             */
    float32_t const *pCurCoeffs =          /*  coefficient pointer       */
                (float32_t const *) S->pCoeffs;
    f32x4_t b0Coeffs, a0Coeffs;           /*  Coefficients vector       */
    f32x4_t b1Coeffs, a1Coeffs;           /*  Modified coef. vector     */
    f32x4_t state;                        /*  State vector              */

    do
    {
        /*
         * temporary carry variable for feeding the 128-bit vector shifter
         */
        uint32_t  tmp = 0;
        /*
         * Reading the coefficients
         * b0Coeffs = {b0, b1, b2, x}
         * a0Coeffs = { x, a1, a2, x}
         */
        b0Coeffs = vld1q(pCurCoeffs);   pCurCoeffs+= 2;
        a0Coeffs = vld1q(pCurCoeffs);   pCurCoeffs+= 3;
        /*
         * Reading the state values
         * state = {d1, d2,  0, 0}
         */
        state = *(f32x4_t *) pState;
        state = vsetq_lane(0.0f, state, 2);
        state = vsetq_lane(0.0f, state, 3);

        /* b1Coeffs = {b0, b1, b2, x} */
        /* b1Coeffs = { x, x, a1, a2} */
        b1Coeffs = (f32x4_t)vshlcq_s32((int32x4_t)b0Coeffs, &tmp, 32);
        a1Coeffs = (f32x4_t)vshlcq_s32((int32x4_t)a0Coeffs, &tmp, 32);

        sample = blockSize / 2;

        /* unrolled 2 x */
        while (sample > 0U)
        {
            /*
             * Read 2 inputs
             */
            Xn0 = *pIn++;
            Xn1 = *pIn++;

            /*
             * 1st half:
             * / acc0 \   / b0 \         / d1 \   / 0  \
             * |  d1  | = | b1 | * Xn0 + | d2 | + | a1 | x acc0
             * |  d2  |   | b2 |         | 0  |   | a2 |
             * \  x   /   \ x  /         \ x  /   \ x  /
             */

            state = vfmaq(state, b0Coeffs, Xn0);
            acc0 = vgetq_lane(state, 0);
            state = vfmaq(state, a0Coeffs, acc0);
            state = vsetq_lane(0.0f, state, 3);

            /*
             * 2nd half:
             * same as 1st half, but all vector elements shifted down.
             * /  x   \   / x  \         / x  \   / x  \
             * | acc1 | = | b0 | * Xn1 + | d1 | + | 0  | x acc1
             * |  d1  |   | b1 |         | d2 |   | a1 |
             * \  d2  /   \ b2 /         \ 0  /   \ a2 /
             */

            state = vfmaq(state, b1Coeffs, Xn1);
            acc1 = vgetq_lane(state, 1);
            state = vfmaq(state, a1Coeffs, acc1);

            /* move d1, d2 up + clearing */
            /* expect dual move or long move */
            state = vsetq_lane(vgetq_lane(state, 2), state, 0);
            state = vsetq_lane(vgetq_lane(state, 3), state, 1);
            state = vsetq_lane(0.0f, state, 2);
            /*
             * Store the results in the destination buffer.
             */
            *pOut++ = acc0;
            *pOut++ = acc1;
            /*
             * decrement the loop counter
             */
            sample--;
        }

        /*
         * tail handling
         */
        if (blockSize & 1)
        {
            Xn0 = *pIn++;
            state = vfmaq(state, b0Coeffs, Xn0);
            acc0 = vgetq_lane(state, 0);

            state = vfmaq(state, a0Coeffs, acc0);
            *pOut++ = acc0;
            *pState++ = vgetq_lane(state, 1);
            *pState++ = vgetq_lane(state, 2);
        }
        else
        {
            *pState++ = vgetq_lane(state, 0);
            *pState++ = vgetq_lane(state, 1);
        }
        /*
         * The current stage output is given as the input to the next stage
         */
        pIn = pDst;
        /*
         * Reset the output working pointer
         */
        pOut = pDst;
        /*
         * decrement the loop counter
         */
        stage--;
    }
    while (stage > 0U);
}
#else
#if defined(ARM_MATH_NEON)

ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f32(
  const arm_biquad_cascade_df2T_instance_f32 * S,
  const float32_t * pSrc,
        float32_t * pDst,
        uint32_t blockSize)
{
   const float32_t *pIn = pSrc;                   /*  source pointer            */
   float32_t *pOut = pDst;                        /*  destination pointer       */
   float32_t *pState = S->pState;                 /*  State pointer             */
   const float32_t *pCoeffs = S->pCoeffs;         /*  coefficient pointer       */
   float32_t acc1;                                /*  accumulator               */
   float32_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
   float32_t Xn1;                                 /*  temporary input           */
   float32_t d1, d2;                              /*  state variables           */
   uint32_t sample, stageCnt,stage = S->numStages;         /*   loop counters   */


   float32x4_t XnV, YnV;
   float32x4x2_t dV;
   float32x4_t zeroV = vdupq_n_f32(0.0);
   float32x4_t t1,t2,t3,t4,b1V,b2V,a1V,a2V,s;

   /* Loop unrolling. Compute 4 outputs at a time */
   stageCnt = stage >> 2;

   while (stageCnt > 0U)
   {
      /* Reading the coefficients */
      t1 = vld1q_f32(pCoeffs);
      pCoeffs += 4;

      t2 = vld1q_f32(pCoeffs);
      pCoeffs += 4;

      t3 = vld1q_f32(pCoeffs);
      pCoeffs += 4;

      t4 = vld1q_f32(pCoeffs);
      pCoeffs += 4;

      b1V = vld1q_f32(pCoeffs);
      pCoeffs += 4;

      b2V = vld1q_f32(pCoeffs);
      pCoeffs += 4;

      a1V = vld1q_f32(pCoeffs);
      pCoeffs += 4;

      a2V = vld1q_f32(pCoeffs);
      pCoeffs += 4;

      /* Reading the state values */
      dV = vld2q_f32(pState);

      sample = blockSize;

      while (sample > 0U) {
         /* y[n] = b0 * x[n] + d1 */
         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
         /* d2 = b2 * x[n] + a2 * y[n] */

         XnV = vdupq_n_f32(*pIn++);

         s = dV.val[0];
         YnV = s;

         s = vextq_f32(zeroV,dV.val[0],3);
         YnV = vmlaq_f32(YnV, t1, s);

         s = vextq_f32(zeroV,dV.val[0],2);
         YnV = vmlaq_f32(YnV, t2, s);

         s = vextq_f32(zeroV,dV.val[0],1);
         YnV = vmlaq_f32(YnV, t3, s);

         YnV = vmlaq_f32(YnV, t4, XnV);

         s = vextq_f32(XnV,YnV,3);

         dV.val[0] = vmlaq_f32(dV.val[1], s, b1V);
         dV.val[0] = vmlaq_f32(dV.val[0], YnV, a1V);

         dV.val[1] = vmulq_f32(s, b2V);
         dV.val[1] = vmlaq_f32(dV.val[1], YnV, a2V);

         *pOut++ = vgetq_lane_f32(YnV, 3) ;

         sample--;
      }

      /* Store the updated state variables back into the state array */
      vst2q_f32(pState,dV);
      pState += 8;

      /* The current stage output is given as the input to the next stage */
      pIn = pDst;

      /*Reset the output working pointer */
      pOut = pDst;

      /* decrement the loop counter */
      stageCnt--;

   }

   /* Tail */
   stageCnt = stage & 3;

   while (stageCnt > 0U)
   {
      /* Reading the coefficients */
      b0 = *pCoeffs++;
      b1 = *pCoeffs++;
      b2 = *pCoeffs++;
      a1 = *pCoeffs++;
      a2 = *pCoeffs++;

      /*Reading the state values */
      d1 = pState[0];
      d2 = pState[1];

      sample = blockSize;

      while (sample > 0U)
      {
         /* Read the input */
         Xn1 = *pIn++;

         /* y[n] = b0 * x[n] + d1 */
         acc1 = (b0 * Xn1) + d1;

         /* Store the result in the accumulator in the destination buffer. */
         *pOut++ = acc1;

         /* Every time after the output is computed state should be updated. */
         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
         d1 = ((b1 * Xn1) + (a1 * acc1)) + d2;

         /* d2 = b2 * x[n] + a2 * y[n] */
         d2 = (b2 * Xn1) + (a2 * acc1);

         /* decrement the loop counter */
         sample--;
      }

      /* Store the updated state variables back into the state array */
      *pState++ = d1;
      *pState++ = d2;

      /* The current stage output is given as the input to the next stage */
      pIn = pDst;

      /*Reset the output working pointer */
      pOut = pDst;

      /* decrement the loop counter */
      stageCnt--;
   }
}
#else

ARM_DSP_ATTRIBUTE void arm_biquad_cascade_df2T_f32(
  const arm_biquad_cascade_df2T_instance_f32 * S,
  const float32_t * pSrc,
        float32_t * pDst,
        uint32_t blockSize)
{
  const float32_t *pIn = pSrc;                         /* Source pointer */
        float32_t *pOut = pDst;                        /* Destination pointer */
        float32_t *pState = S->pState;                 /* State pointer */
  const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
        float32_t acc1;                                /* Accumulator */
        float32_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
        float32_t Xn1;                                 /* Temporary input */
        float32_t d1, d2;                              /* State variables */
        uint32_t sample, stage = S->numStages;         /* Loop counters */

  do
  {
     /* Reading the coefficients */
     b0 = pCoeffs[0];
     b1 = pCoeffs[1];
     b2 = pCoeffs[2];
     a1 = pCoeffs[3];
     a2 = pCoeffs[4];

     /* Reading the state values */
     d1 = pState[0];
     d2 = pState[1];

     pCoeffs += 5U;

#if defined (ARM_MATH_LOOPUNROLL)

     /* Loop unrolling: Compute 16 outputs at a time */
     sample = blockSize >> 4U;

     while (sample > 0U) {

       /* y[n] = b0 * x[n] + d1 */
       /* d1 = b1 * x[n] + a1 * y[n] + d2 */
       /* d2 = b2 * x[n] + a2 * y[n] */

/*  1 */
       Xn1 = *pIn++;

       acc1 = b0 * Xn1 + d1;

       d1 = b1 * Xn1 + d2;
       d1 += a1 * acc1;

       d2 = b2 * Xn1;
       d2 += a2 * acc1;

       *pOut++ = acc1;

/*  2 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/*  3 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/*  4 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/*  5 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/*  6 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/*  7 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/*  8 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/*  9 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/* 10 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/* 11 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/* 12 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/* 13 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/* 14 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/* 15 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

/* 16 */
         Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

        /* decrement loop counter */
        sample--;
      }

      /* Loop unrolling: Compute remaining outputs */
      sample = blockSize & 0xFU;

#else

      /* Initialize blkCnt with number of samples */
      sample = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

      while (sample > 0U) {
        Xn1 = *pIn++;

        acc1 = b0 * Xn1 + d1;

        d1 = b1 * Xn1 + d2;
        d1 += a1 * acc1;

        d2 = b2 * Xn1;
        d2 += a2 * acc1;

        *pOut++ = acc1;

        /* decrement loop counter */
        sample--;
      }

      /* Store the updated state variables back into the state array */
      pState[0] = d1;
      pState[1] = d2;

      pState += 2U;

      /* The current stage output is given as the input to the next stage */
      pIn = pDst;

      /* Reset the output working pointer */
      pOut = pDst;

      /* decrement loop counter */
      stage--;

   } while (stage > 0U);

}

#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
  @} end of BiquadCascadeDF2T group
 */