1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_levinson_durbin_q31.c
4  * Description:  q31 version of Levinson Durbin algorithm
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 #define ONE_Q31 0x7FFFFFFFL
32 #define TWO_Q30 0x7FFFFFFFL
33 
34 #define HALF_Q31 0x00008000L
35 #define ONE_Q15 0x7FFF
36 #define HALF_Q15 0x3FFF
37 #define LOWPART_MASK 0x07FFF
38 
mul32x16(q31_t a,q15_t b)39 __STATIC_FORCEINLINE q31_t mul32x16(q31_t a, q15_t b)
40 {
41   q31_t r = ((q63_t)a * (q63_t)b) >> 15;
42 
43   return(r);
44 
45 }
46 
mul32x32(q31_t a,q31_t b)47 __STATIC_FORCEINLINE q31_t mul32x32(q31_t a, q31_t b)
48 {
49   //q31_t r = __SSAT(((q63_t)a * b) >> 31,31);
50   q31_t r = ((q63_t)a * b) >> 31;
51 
52   return(r);
53 
54 }
55 
divide(q31_t n,q31_t d)56 __STATIC_FORCEINLINE q31_t divide(q31_t n, q31_t d)
57 {
58   arm_status status;
59   int16_t shift;
60   q15_t inverse;
61   q31_t r;
62   // We are computing:
63   // n / d = n / (h + l) where h and l are the high end and low end part.
64   // 1 / (h + l) = 1 / h (1 - l / h)
65   // Our division algorithm has a shift. So it is returning a scaled value sh.
66   // So we need a << shift to convert 1/ sh to 1/h.
67   // In below code, we are organizing the computation differently. Instead of computing:
68   // 1 / h (1 - l / h)
69   // we are computing
70   // 1 / h (2 - (l + h) / h)
71   // 1 / h (2 - d / h)
72   // Also, we are not computing 1/h in Q15 but in Q14.
73   // 2 is expressed in Q30.
74   // So at the end of all computation we need a << 2
75 
76   // Result is in Q14 because of use of HALF_Q15 instead of ONE_Q15.
77   status=arm_divide_q15(HALF_Q15,d>>16,&inverse,&shift);
78   (void)status;
79 
80   // d is used instead of l
81   // So we will need to substract to 2 instead of 1.
82   r = mul32x16(d,inverse);
83   r = TWO_Q30 - (r << shift);
84   r = mul32x16(r, inverse);
85   r = mul32x32(r,n) ;
86   r = r << (shift + 2);
87 
88   return(r);
89 
90 }
91 
92 /**
93   @ingroup groupFilters
94  */
95 
96 
97 
98 /**
99   @addtogroup LD
100   @{
101  */
102 
103 /**
104   @brief         Levinson Durbin
105   @param[in]     phi      autocovariance vector starting with lag 0 (length is nbCoefs + 1)
106   @param[out]    a        autoregressive coefficients
107   @param[out]    err      prediction error (variance)
108   @param[in]     nbCoefs  number of autoregressive coefficients
109  */
110 
111 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(ARM_DSP_BUILT_WITH_GCC)
112 #pragma message "Scalar version of arm_levinson_durbin_q31 built. Helium version has build issues with gcc."
113 #endif
114 
115 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(ARM_DSP_BUILT_WITH_GCC)
116 
117 #define LANE23_MASK 0xFF00
118 
119 #include "arm_helium_utils.h"
ARM_DSP_ATTRIBUTE void arm_levinson_durbin_q31(const q31_t *phi,
  q31_t *a,
  q31_t *err,
  int nbCoefs)
{
    /* Running prediction-error (variance) estimate, Q31. */
    q31_t e;

    /* Gather offsets to load 4 consecutive q31 samples in reversed order. */
    static const uint32_t revOffsetArray[4] = {3,2,1,0};

   /* Order-1 initialization. */
   //a[0] = phi[1] / phi[0];
   a[0] = divide(phi[1], phi[0]);


   //e = phi[0] - phi[1] * a[0];
   e = phi[0] - mul32x32(phi[1],a[0]);

   /* Raise the model order one step at a time up to nbCoefs. */
   for(int p=1; p < nbCoefs; p++)
   {
      q63_t suma=0;   /* accumulates a[i]*phi[p-i] (64-bit, Q62) */
      q63_t sumb=0;   /* accumulates a[i]*phi[i+1] (64-bit, Q62) */
      q31x4_t vecA,vecRevPhi,vecPhi;
      q31_t k;        /* reflection coefficient for order p */
      uint32_t blkCnt;
      const q31_t *pPhi,*pRevPhi,*pA;
      uint32x4_t revOffset;


      int nb,j,i;

      revOffset = vld1q(revOffsetArray);

      /* pRevPhi points 3 below phi[p] so the reversed gather yields
         phi[p], phi[p-1], phi[p-2], phi[p-3]. */
      pRevPhi = &phi[p-3];
      pPhi = &phi[1];
      pA = a;

      /* Vectorized accumulation of suma/sumb, 4 taps per iteration. */
      i = 0;
      blkCnt = p >> 2;
      while(blkCnt > 0)
      {
         vecA = vld1q(pA);
         pA += 4;

         vecPhi = vld1q(pPhi);
         pPhi += 4;

         /* Reversed load of phi for the suma dot product. */
         vecRevPhi = vldrwq_gather_shifted_offset_s32(pRevPhi,revOffset);
         pRevPhi -= 4;

         /* 64-bit multiply-accumulate across all 4 lanes. */
         suma = vmlaldavaq(suma,vecA,vecRevPhi);
         sumb = vmlaldavaq(sumb,vecA,vecPhi);

         i += 4;
         blkCnt--;
      }


      /* Scalar tail: remaining 0..3 taps. */
      blkCnt = p & 3;
      while(blkCnt > 0)
      {
         suma += ((q63_t)a[i] * phi[p - i]);
         sumb += ((q63_t)a[i] * phi[i + 1]);

         i++;
         blkCnt--;
      }

      /* Bring the Q62 accumulators back to Q31. */
      suma = asrl(suma, 31);
      sumb = asrl(sumb, 31);



      /* Reflection coefficient for this order. */
      //k = (phi[p+1]-suma)/(phi[0] - sumb);
      k = divide(phi[p+1]-(q31_t)suma,phi[0] - (q31_t)sumb);

      q31x4_t vecRevA,tmp;
      /* Offsets {j, j+1, p-1-j, p-2-j}: LANE23_MASK predicates the add of p
         onto lanes 2 and 3 only. */
      static int32_t orgOffsetArray[4]={0,1,-1,-2};
      static const int32_t offsetIncArray[4]={2,2,-2,-2};

      uint32x4_t offset,offsetInc,vecTmp;


      offset = vld1q_u32((uint32_t*)orgOffsetArray);
      vecTmp = vdupq_n_u32(p);

      offset = vaddq_m_u32(offset,offset,vecTmp,LANE23_MASK);
      offsetInc = vld1q_u32((uint32_t*)offsetIncArray);


      /* Vectorized symmetric update: two coefficient pairs per iteration. */
      nb = p >> 2;
      j=0;
      for(int i =0;i < nb ; i++)
      {
        /* Scalar equivalent of this iteration:
          q31_t x0,x1,x2,x3;

          //x = a[j] - k * a[p-1-j];
          x0 = a[j] - mul32x32(k,a[p-1-j]);
          x1 = a[j+1] - mul32x32(k,a[p-2-j]);

          //y = a[p-1-j] - k * a[j];
          x2 = a[p-1-j] - mul32x32(k , a[j]);
          x3 = a[p-2-j] - mul32x32(k , a[j+1]);

          a[j] = x0;
          a[j+1] = x1;
          a[p-1-j] = x2;
          a[p-2-j] = x3;
        */

          uint64_t tmpa,tmpb;
          /* vecA = [a[j], a[j+1], a[p-1-j], a[p-2-j]] via the gather. */
          vecA = vldrwq_gather_shifted_offset_s32(a,offset);


          /* Swap the two 64-bit halves so vecRevA pairs each lane of vecA
             with its mirror coefficient:
             vecRevA = [a[p-1-j], a[p-2-j], a[j], a[j+1]].
             NOTE(review): vecRevA is read by vsetq_lane before being fully
             written; both lanes are set before use, but the first read is of
             an uninitialized vector — confirm this is benign for the target
             toolchain. */
          tmpa = vgetq_lane_u64((uint64x2_t)vecA,0);
          tmpb = vgetq_lane_u64((uint64x2_t)vecA,1);
          vecRevA = (q31x4_t) vsetq_lane_u64(tmpb,(uint64x2_t)vecRevA,0);
          vecRevA = (q31x4_t) vsetq_lane_u64(tmpa,(uint64x2_t)vecRevA,1);


          /* a[...] -= k * mirror, all four updates at once; scatter back. */
          tmp = vsubq(vecA,vqdmulhq_n_s32(vecRevA,k));
          vstrwq_scatter_shifted_offset_s32(a, offset, tmp);

          offset = vaddq(offset,offsetInc);

          j+=2;
      }

      /* Tail of the symmetric update: 0..3 coefficients left. */
      switch(p & 3)
      {
         case 3:
         {
          q31_t x,y;

          //x = a[j] - k * a[p-1-j];
          x = a[j] - mul32x32(k,a[p-1-j]);

          //y = a[p-1-j] - k * a[j];
          y = a[p-1-j] - mul32x32(k , a[j]);

          a[j] = x;
          a[p-1-j] = y;

          /* Middle element (its mirror is itself). */
          //a[j] = a[j]- k * a[p-1-j];
          a[j+1] = a[j+1] - mul32x32(k,a[p-2-j]);
         }
         break;

         case 2:
         {
          q31_t x,y;

          //x = a[j] - k * a[p-1-j];
          x = a[j] - mul32x32(k,a[p-1-j]);

          //y = a[p-1-j] - k * a[j];
          y = a[p-1-j] - mul32x32(k , a[j]);

          a[j] = x;
          a[p-1-j] = y;
         }
         break;

         case 1:
            /* Single middle element. */
            //a[j] = a[j]- k * a[p-1-j];
            a[j] = a[j] - mul32x32(k,a[p-1-j]);
         break;
      }

      /* New highest-order coefficient is the reflection coefficient. */
      a[p] = k;

      /* Update prediction error. */
      // e = e * (1 - k*k);
      e = mul32x32(e,ONE_Q31 - mul32x32(k,k));


   }
   *err = e;
}
297 
298 #else
299 
arm_levinson_durbin_q31(const q31_t * phi,q31_t * a,q31_t * err,int nbCoefs)300 ARM_DSP_ATTRIBUTE void arm_levinson_durbin_q31(const q31_t *phi,
301   q31_t *a,
302   q31_t *err,
303   int nbCoefs)
304 {
305    q31_t e;
306    int p;
307 
308    //a[0] = phi[1] / phi[0];
309    a[0] = divide(phi[1], phi[0]);
310 
311 
312    //e = phi[0] - phi[1] * a[0];
313    e = phi[0] - mul32x32(phi[1],a[0]);
314 
315    for(p=1; p < nbCoefs; p++)
316    {
317       q63_t suma=0;
318       q63_t sumb=0;
319       q31_t k;
320       int nb,j,i;
321 
322       for(i=0; i < p; i++)
323       {
324          suma += ((q63_t)a[i] * phi[p - i]);
325          sumb += ((q63_t)a[i] * phi[i + 1]);
326       }
327 
328       suma = suma >> 31;
329       sumb = sumb >> 31;
330 
331 
332 
333       //k = (phi[p+1]-suma)/(phi[0] - sumb);
334       k = divide(phi[p+1]-(q31_t)suma,phi[0] - (q31_t)sumb);
335 
336 
337       nb = p >> 1;
338       j=0;
339       for(i =0;i < nb ; i++)
340       {
341           q31_t x,y;
342 
343           //x = a[j] - k * a[p-1-j];
344           x = a[j] - mul32x32(k,a[p-1-j]);
345 
346           //y = a[p-1-j] - k * a[j];
347           y = a[p-1-j] - mul32x32(k , a[j]);
348 
349           a[j] = x;
350           a[p-1-j] = y;
351 
352           j++;
353       }
354 
355       nb = p & 1;
356       if (nb)
357       {
358             //a[j] = a[j]- k * a[p-1-j];
359             a[j] = a[j] - mul32x32(k,a[p-1-j]);
360       }
361 
362       a[p] = k;
363 
364       // e = e * (1 - k*k);
365       e = mul32x32(e,ONE_Q31 - mul32x32(k,k));
366 
367 
368    }
369    *err = e;
370 }
371 #endif /* defined(ARM_MATH_MVEI) */
372 
373 /**
374   @} end of LD group
375  */
376