 @ webrtc/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S

 @
 @ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 @
 @ Use of this source code is governed by a BSD-style license
 @ that can be found in the LICENSE file in the root of the source
 @ tree. An additional intellectual property rights grant can be found
 @ in the file PATENTS.  All contributing project authors may
 @ be found in the AUTHORS file in the root of the source tree.
 @

 @ Contains the function WebRtcIsacfix_AllpassFilter2FixDec16Neon() of the
 @ iSAC codec, optimized for the ARM Neon platform. It is meant to be bit
 @ exact with the corresponding C implementation in filterbanks.c. Prototype
 @ C code is at the end of this file.

 #include "webrtc/system_wrappers/interface/asm_defines.h"

 GLOBAL_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
 .align  2

 @void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
 @    int16_t *data_ch1,  // Input and output in channel 1, in Q0
 @    int16_t *data_ch2,  // Input and output in channel 2, in Q0
 @    const int16_t *factor_ch1,  // Scaling factor for channel 1, in Q15
 @    const int16_t *factor_ch2,  // Scaling factor for channel 2, in Q15
 @    const int length,           // Length of the data buffers
 @    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
 @    int32_t *filter_state_ch2); // Filter state for channel 2, in Q16
 @
 @ Filters data_ch1 and data_ch2 in place, both channels in parallel, and
 @ writes the two updated state words of each channel back to
 @ filter_state_ch1 and filter_state_ch2. The variable names used in the
 @ comments below (sample0, sample1, a0, a1, b0, b1, b2, state0, state1)
 @ are those of the prototype C code at the end of this file.
 @
 @ Arguments as used by the code:
 @   r0 = data_ch1, r1 = data_ch2, r2 = factor_ch1, r3 = factor_ch2.
 @   length, filter_state_ch1 and filter_state_ch2 are read from the stack
 @   at [sp, #16], [sp, #20] and [sp, #24] after the 16-byte push.
 @
 @ Register roles once initialization is done:
 @   r0, r1 : running pointers into data_ch1 and data_ch2
 @   r2     : -2, post-increment used by the single-sample loads
 @   r4     : +4, post-increment used by the single-sample stores in the loop
 @   r3     : loop counter, starts at length - 2, minus 2 per iteration
 @   r6, r5 : pointers into filter_state_ch1 and filter_state_ch2
 @   d30    : {factor_ch1[0], factor_ch1[1], factor_ch2[0], factor_ch2[1]}
 @   d31    : d30 negated
 @   q8     : {state0_ch1, state1_ch1, state0_ch2, state1_ch2}
 @   d0     : current 16-bit inputs, 16-bit lanes 0-1 for channel 1 and
 @            lanes 2-3 for channel 2
 @
 @ The loop condition is tested at the bottom of the loop, so the loop body
 @ always runs at least once. Together with the pre- and post-processing
 @ this reads and writes at least four samples per channel.
 @ NOTE(review): looks like length must be even and >= 4; confirm with the
 @ callers.

 DEFINE_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
   push {r4 - r7}              @ 16 bytes pushed; r7 is not used below

   ldr r5, [sp, #24]           @ filter_state_ch2
   ldr r6, [sp, #20]           @ filter_state_ch1

   @ Initialize the Neon registers.
   vld1.16 d0[0], [r0]!        @ data_ch1[0]; r0 = &data_ch1[1]
   vld1.16 d0[2], [r1]!        @ data_ch2[0]; r1 = &data_ch2[1]
   vld1.32 d30[0], [r2]        @ factor_ch1[0], factor_ch1[1]
   vld1.32 d30[1], [r3]        @ factor_ch2[0], factor_ch2[1]
   vld1.32 d16[0], [r6]!       @ filter_state_ch1[0]; r6 = &filter_state_ch1[1]
   vld1.32 d17[0], [r5]!       @ filter_state_ch2[0]; r5 = &filter_state_ch2[1]
   vneg.s16 d31, d30           @ Negated factors

   ldr r3, [sp, #16]           @ length (r3 is free, factor_ch2 is in d30)
   mov r4, #4                  @ Store post-increment: forward two samples
   mov r2, #-2                 @ Load post-increment: back one sample
   sub r3, #2                  @ Loop counter = length - 2

   @ Loop unrolling pre-processing.
   @ Only the factor[0] / state0 lanes (16-bit lanes 0 and 2 of d0, 32-bit
   @ lanes 0 and 2 of q8) hold loaded data here. The other lanes of q8 are
   @ overwritten by the loads of the second state words below.
   vqdmull.s16 q1, d30, d0     @ a0 = (factor[0] * sample0) << 1, saturating
   vshll.s16 q0, d0, #16       @ sample0 << 16
   vqadd.s32 q2, q1, q8        @ b0 = sat(a0 + state0)
   vshrn.i32 d6, q2, #16       @ b0 >> 16
   vmull.s16 q1, d31, d6       @ a0 = -factor[0] * (b0 >> 16)
   vshl.s32 q1, #1             @ a0 << 1
   vqadd.s32 q8, q1, q0        @ state0 = sat((a0 << 1) + (sample0 << 16))
   vld1.32 d16[1], [r6]        @ filter_state_ch1[1]
   vld1.32 d17[1], [r5]        @ filter_state_ch2[1]
   sub r6, #4                  @ &filter_state_ch1[0]
   sub r5, #4                  @ &filter_state_ch2[0]
   vld1.16 d6[1], [r0], r2     @ data_ch1[1]; r0 back to &data_ch1[0]
   vld1.16 d6[3], [r1], r2     @ data_ch2[1]; r1 back to &data_ch2[0]
   vrev32.16 d0, d6            @ d0 = {sample1, sample0} for each channel

 FOR_LOOP:
   @ First half: per channel, even lane = factor[0] path on sample1,
   @ odd lane = factor[1] path on sample0.
   vqdmull.s16 q1, d30, d0     @ {a1, a0} = (factor * sample) << 1
   vshll.s16 q0, d0, #16       @ {sample1, sample0} << 16
   vqadd.s32 q2, q1, q8        @ {b1, b0} = sat(a + state)
   vshrn.i32 d4, q2, #16       @ {b1, b0} >> 16
   vmull.s16 q1, d31, d4       @ -factor * (b >> 16)
   vst1.16 d4[1], [r0], r4     @ Store data_ch1[n]
   vst1.16 d4[3], [r1], r4     @ Store data_ch2[n]
   vshl.s32 q1, #1
   vld1.16 d4[1], [r0], r2     @ Load data_ch1[n + 2]
   vld1.16 d4[3], [r1], r2     @ Load data_ch2[n + 2]
   vqadd.s32 q8, q1, q0        @ New {state0, state1}
   vrev32.16 d0, d4            @ d0 = {data[n + 2], b1 >> 16}
   @ Second half: even lane = factor[0] path on data[n + 2],
   @ odd lane = factor[1] path on b1 >> 16.
   vqdmull.s16 q1, d30, d0     @ {a0, a1} = (factor * sample) << 1
   subs r3, #2                 @ Sets the flags tested by bgt below
   vqadd.s32 q2, q1, q8        @ {b2, b1} = sat(a + state)
   vshrn.i32 d6, q2, #16       @ {b2, b1} >> 16
   vmull.s16 q1, d31, d6       @ -factor * (b >> 16)
   vshll.s16 q0, d0, #16       @ {sample0, sample1} << 16
   vst1.16 d6[1], [r0], r4     @ Store data_ch1[n + 1]
   vst1.16 d6[3], [r1], r4     @ Store data_ch2[n + 1]
   vshl.s32 q1, #1
   vld1.16 d6[1], [r0], r2     @ Load data_ch1[n + 3]
   vld1.16 d6[3], [r1], r2     @ Load data_ch2[n + 3]
   vqadd.s32 q8, q1, q0        @ New {state0, state1}
   vrev32.16 d0, d6            @ d0 = {data[n + 3], b2 >> 16}
   bgt FOR_LOOP                @ Repeat while the counter is still positive

   @ Loop unrolling post-processing.
   @ First pass: same computation as the first half of the loop body.
   vqdmull.s16 q1, d30, d0     @ {a1, a0} = (factor * sample) << 1
   vshll.s16 q0, d0, #16       @ {sample1, sample0} << 16
   vqadd.s32 q2, q1, q8        @ {b1, b0} = sat(a + state)
   vshrn.i32 d4, q2, #16       @ {b1, b0} >> 16
   vmull.s16 q1, d31, d4       @ -factor * (b >> 16)
   vst1.16 d4[1], [r0]!        @ Store data_ch1[n]
   vst1.16 d4[3], [r1]!        @ Store data_ch2[n]
   vshl.s32 q1, #1
   vqadd.s32 q8, q1, q0        @ Final state0 (even lanes), state1 (odd lanes)
   vrev32.16 d0, d4            @ Odd lanes now hold sample1 = b1 >> 16
   @ Second pass: only the odd (factor[1] / state1) lanes of the results
   @ are stored.
   vqdmull.s16 q1, d30, d0     @ a1 = (factor[1] * sample1) << 1
   vshll.s16 q0, d0, #16       @ sample1 << 16
   vqadd.s32 q2, q1, q8        @ b1 = sat(a1 + state1)
   vshrn.i32 d6, q2, #16       @ b1 >> 16
   vmull.s16 q1, d31, d6       @ a1 = -factor[1] * (b1 >> 16)
   vst1.16 d6[1], [r0]         @ Store data_ch1[n + 1]
   vst1.16 d6[3], [r1]         @ Store data_ch2[n + 1]
   vshl.s32 q1, #1
   vst1.32 d16[0], [r6]!       @ Store filter_state_ch1[0]
   vqadd.s32 q9, q1, q0        @ Final state1 in the odd lanes of q9
   vst1.32 d17[0], [r5]!       @ Store filter_state_ch2[0]
   vst1.32 d18[1], [r6]        @ Store filter_state_ch1[1]
   vst1.32 d19[1], [r5]        @ Store filter_state_ch2[1]

   pop {r4 - r7}
   bx lr

 @void AllpassFilter2FixDec16BothChannels(
 @    int16_t *data_ch1,  // Input and output in channel 1, in Q0
 @    int16_t *data_ch2,  // Input and output in channel 2, in Q0
 @    const int16_t *factor_ch1,  // Scaling factor for channel 1, in Q15
 @    const int16_t *factor_ch2,  // Scaling factor for channel 2, in Q15
 @    const int length,  // Length of the data buffers
 @    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
 @    int32_t *filter_state_ch2) {  // Filter state for channel 2, in Q16
 @  int n = 0;
 @  int32_t state0_ch1 = filter_state_ch1[0], state1_ch1 = filter_state_ch1[1];
 @  int32_t state0_ch2 = filter_state_ch2[0], state1_ch2 = filter_state_ch2[1];
 @  int16_t sample0_ch1 = 0, sample0_ch2 = 0;
 @  int16_t sample1_ch1 = 0, sample1_ch2  = 0;
 @  int32_t a0_ch1 = 0, a0_ch2 = 0;
 @  int32_t b0_ch1 = 0, b0_ch2 = 0;
 @
 @  int32_t a1_ch1 = 0, a1_ch2 = 0;
 @  int32_t b1_ch1 = 0, b1_ch2 = 0;
 @  int32_t b2_ch1  = 0, b2_ch2 = 0;
 @
 @  // Loop unrolling preprocessing.
 @
 @  sample0_ch1 = data_ch1[n];
 @  sample0_ch2 = data_ch2[n];
 @
 @  a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
 @  a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
 @
 @  b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
 @  b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
 @
 @  a0_ch1 = -factor_ch1[0] * (int16_t)(b0_ch1 >> 16);
 @  a0_ch2 = -factor_ch2[0] * (int16_t)(b0_ch2 >> 16);
 @
 @  state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1 <<1, (uint32_t)sample0_ch1 << 16);
 @  state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2 <<1, (uint32_t)sample0_ch2 << 16);
 @
 @  sample1_ch1 = data_ch1[n + 1];
 @  sample0_ch1 = (int16_t) (b0_ch1 >> 16); //Save as Q0
 @  sample1_ch2  = data_ch2[n + 1];
 @  sample0_ch2 = (int16_t) (b0_ch2 >> 16); //Save as Q0
 @
 @
 @  for (n = 0; n < length - 2; n += 2) {
 @    a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
 @    a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
 @    a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
 @    a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
 @
 @    b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
 @    b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1); //Q16+Q16=Q16
 @    b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2); //Q16+Q16=Q16
 @    b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2); //Q16+Q16=Q16
 @
 @    a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
 @    a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
 @    a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
 @    a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
 @
 @    state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 <<16);
 @    state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 <<16);
 @    state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 <<16);
 @    state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 <<16);
 @
 @    sample0_ch1 = data_ch1[n + 2];
 @    sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
 @    sample0_ch2 = data_ch2[n + 2];
 @    sample1_ch2  = (int16_t) (b1_ch2 >> 16); //Save as Q0
 @
 @    a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
 @    a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
 @    a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
 @    a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
 @
 @    b2_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
 @    b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
 @    b2_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
 @    b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
 @
 @    a0_ch1 = -factor_ch1[0] * (int16_t)(b2_ch1 >> 16);
 @    a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
 @    a0_ch2 = -factor_ch2[0] * (int16_t)(b2_ch2 >> 16);
 @    a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
 @
 @    state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1<<16);
 @    state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
 @    state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2<<16);
 @    state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
 @
 @
 @    sample1_ch1 = data_ch1[n + 3];
 @    sample0_ch1 = (int16_t) (b2_ch1  >> 16); //Save as Q0
 @    sample1_ch2 = data_ch2[n + 3];
 @    sample0_ch2 = (int16_t) (b2_ch2 >> 16); //Save as Q0
 @
 @    data_ch1[n]     = (int16_t) (b0_ch1 >> 16); //Save as Q0
 @    data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
 @    data_ch2[n]     = (int16_t) (b0_ch2 >> 16);
 @    data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
 @  }
 @
 @  // Loop unrolling post-processing.
 @
 @  a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
 @  a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
 @  a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
 @  a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
 @
 @  b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
 @  b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1);
 @  b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2);
 @  b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2);
 @
 @  a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
 @  a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
 @  a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
 @  a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
 @
 @  state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 << 16);
 @  state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 << 16);
 @  state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 << 16);
 @  state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 << 16);
 @
 @  data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
 @  data_ch2[n] = (int16_t) (b0_ch2 >> 16);
 @
 @  sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
 @  sample1_ch2  = (int16_t) (b1_ch2 >> 16); //Save as Q0
 @
 @  a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
 @  a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
 @
 @  b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
 @  b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
 @
 @  a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
 @  a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
 @
 @  state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
 @  state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
 @
 @  data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
 @  data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
 @
 @  filter_state_ch1[0] = state0_ch1;
 @  filter_state_ch1[1] = state1_ch1;
 @  filter_state_ch2[0] = state0_ch2;
 @  filter_state_ch2[1] = state1_ch2;
 @}
	@
	@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
	@
	@ Use of this source code is governed by a BSD-style license
	@ that can be found in the LICENSE file in the root of the source
	@ tree. An additional intellectual property rights grant can be found
	@ in the file PATENTS. All contributing project authors may
	@ be found in the AUTHORS file in the root of the source tree.
	@

	@ Contains the function WebRtcIsacfix_AllpassFilter2FixDec16Neon() of the
	@ iSAC codec, optimized for the ARM Neon platform. It is meant to be bit
	@ exact with the corresponding C implementation in filterbanks.c. Prototype
	@ C code is at the end of this file.

	#include "webrtc/system_wrappers/interface/asm_defines.h"

	GLOBAL_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
	.align 2

	@void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
	@ int16_t *data_ch1, // Input and output in channel 1, in Q0
	@ int16_t *data_ch2, // Input and output in channel 2, in Q0
	@ const int16_t *factor_ch1, // Scaling factor for channel 1, in Q15
	@ const int16_t *factor_ch2, // Scaling factor for channel 2, in Q15
	@ const int length, // Length of the data buffers
	@ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16
	@ int32_t *filter_state_ch2); // Filter state for channel 2, in Q16
	@
	@ Filters data_ch1 and data_ch2 in place, both channels in parallel, and
	@ writes the two updated state words of each channel back to
	@ filter_state_ch1 and filter_state_ch2. The variable names used in the
	@ comments below (sample0, sample1, a0, a1, b0, b1, b2, state0, state1)
	@ are those of the prototype C code at the end of this file.
	@
	@ Arguments as used by the code:
	@ r0 = data_ch1, r1 = data_ch2, r2 = factor_ch1, r3 = factor_ch2.
	@ length, filter_state_ch1 and filter_state_ch2 are read from the stack
	@ at [sp, #16], [sp, #20] and [sp, #24] after the 16-byte push.
	@
	@ Register roles once initialization is done:
	@ r0, r1 : running pointers into data_ch1 and data_ch2
	@ r2 : -2, post-increment used by the single-sample loads
	@ r4 : +4, post-increment used by the single-sample stores in the loop
	@ r3 : loop counter, starts at length - 2, minus 2 per iteration
	@ r6, r5 : pointers into filter_state_ch1 and filter_state_ch2
	@ d30 : {factor_ch1[0], factor_ch1[1], factor_ch2[0], factor_ch2[1]}
	@ d31 : d30 negated
	@ q8 : {state0_ch1, state1_ch1, state0_ch2, state1_ch2}
	@ d0 : current 16-bit inputs, 16-bit lanes 0-1 for channel 1 and
	@ lanes 2-3 for channel 2
	@
	@ The loop condition is tested at the bottom of the loop, so the loop body
	@ always runs at least once. Together with the pre- and post-processing
	@ this reads and writes at least four samples per channel.
	@ NOTE(review): looks like length must be even and >= 4; confirm with the
	@ callers.

	DEFINE_FUNCTION WebRtcIsacfix_AllpassFilter2FixDec16Neon
	push {r4 - r7} @ 16 bytes pushed; r7 is not used below

	ldr r5, [sp, #24] @ filter_state_ch2
	ldr r6, [sp, #20] @ filter_state_ch1

	@ Initialize the Neon registers.
	vld1.16 d0[0], [r0]! @ data_ch1[0]; r0 = &data_ch1[1]
	vld1.16 d0[2], [r1]! @ data_ch2[0]; r1 = &data_ch2[1]
	vld1.32 d30[0], [r2] @ factor_ch1[0], factor_ch1[1]
	vld1.32 d30[1], [r3] @ factor_ch2[0], factor_ch2[1]
	vld1.32 d16[0], [r6]! @ filter_state_ch1[0]; r6 = &filter_state_ch1[1]
	vld1.32 d17[0], [r5]! @ filter_state_ch2[0]; r5 = &filter_state_ch2[1]
	vneg.s16 d31, d30 @ Negated factors

	ldr r3, [sp, #16] @ length (r3 is free, factor_ch2 is in d30)
	mov r4, #4 @ Store post-increment: forward two samples
	mov r2, #-2 @ Load post-increment: back one sample
	sub r3, #2 @ Loop counter = length - 2

	@ Loop unrolling pre-processing.
	@ Only the factor[0] / state0 lanes (16-bit lanes 0 and 2 of d0, 32-bit
	@ lanes 0 and 2 of q8) hold loaded data here. The other lanes of q8 are
	@ overwritten by the loads of the second state words below.
	vqdmull.s16 q1, d30, d0 @ a0 = (factor[0] * sample0) << 1, saturating
	vshll.s16 q0, d0, #16 @ sample0 << 16
	vqadd.s32 q2, q1, q8 @ b0 = sat(a0 + state0)
	vshrn.i32 d6, q2, #16 @ b0 >> 16
	vmull.s16 q1, d31, d6 @ a0 = -factor[0] * (b0 >> 16)
	vshl.s32 q1, #1 @ a0 << 1
	vqadd.s32 q8, q1, q0 @ state0 = sat((a0 << 1) + (sample0 << 16))
	vld1.32 d16[1], [r6] @ filter_state_ch1[1]
	vld1.32 d17[1], [r5] @ filter_state_ch2[1]
	sub r6, #4 @ &filter_state_ch1[0]
	sub r5, #4 @ &filter_state_ch2[0]
	vld1.16 d6[1], [r0], r2 @ data_ch1[1]; r0 back to &data_ch1[0]
	vld1.16 d6[3], [r1], r2 @ data_ch2[1]; r1 back to &data_ch2[0]
	vrev32.16 d0, d6 @ d0 = {sample1, sample0} for each channel

	FOR_LOOP:
	@ First half: per channel, even lane = factor[0] path on sample1,
	@ odd lane = factor[1] path on sample0.
	vqdmull.s16 q1, d30, d0 @ {a1, a0} = (factor * sample) << 1
	vshll.s16 q0, d0, #16 @ {sample1, sample0} << 16
	vqadd.s32 q2, q1, q8 @ {b1, b0} = sat(a + state)
	vshrn.i32 d4, q2, #16 @ {b1, b0} >> 16
	vmull.s16 q1, d31, d4 @ -factor * (b >> 16)
	vst1.16 d4[1], [r0], r4 @ Store data_ch1[n]
	vst1.16 d4[3], [r1], r4 @ Store data_ch2[n]
	vshl.s32 q1, #1
	vld1.16 d4[1], [r0], r2 @ Load data_ch1[n + 2]
	vld1.16 d4[3], [r1], r2 @ Load data_ch2[n + 2]
	vqadd.s32 q8, q1, q0 @ New {state0, state1}
	vrev32.16 d0, d4 @ d0 = {data[n + 2], b1 >> 16}
	@ Second half: even lane = factor[0] path on data[n + 2],
	@ odd lane = factor[1] path on b1 >> 16.
	vqdmull.s16 q1, d30, d0 @ {a0, a1} = (factor * sample) << 1
	subs r3, #2 @ Sets the flags tested by bgt below
	vqadd.s32 q2, q1, q8 @ {b2, b1} = sat(a + state)
	vshrn.i32 d6, q2, #16 @ {b2, b1} >> 16
	vmull.s16 q1, d31, d6 @ -factor * (b >> 16)
	vshll.s16 q0, d0, #16 @ {sample0, sample1} << 16
	vst1.16 d6[1], [r0], r4 @ Store data_ch1[n + 1]
	vst1.16 d6[3], [r1], r4 @ Store data_ch2[n + 1]
	vshl.s32 q1, #1
	vld1.16 d6[1], [r0], r2 @ Load data_ch1[n + 3]
	vld1.16 d6[3], [r1], r2 @ Load data_ch2[n + 3]
	vqadd.s32 q8, q1, q0 @ New {state0, state1}
	vrev32.16 d0, d6 @ d0 = {data[n + 3], b2 >> 16}
	bgt FOR_LOOP @ Repeat while the counter is still positive

	@ Loop unrolling post-processing.
	@ First pass: same computation as the first half of the loop body.
	vqdmull.s16 q1, d30, d0 @ {a1, a0} = (factor * sample) << 1
	vshll.s16 q0, d0, #16 @ {sample1, sample0} << 16
	vqadd.s32 q2, q1, q8 @ {b1, b0} = sat(a + state)
	vshrn.i32 d4, q2, #16 @ {b1, b0} >> 16
	vmull.s16 q1, d31, d4 @ -factor * (b >> 16)
	vst1.16 d4[1], [r0]! @ Store data_ch1[n]
	vst1.16 d4[3], [r1]! @ Store data_ch2[n]
	vshl.s32 q1, #1
	vqadd.s32 q8, q1, q0 @ Final state0 (even lanes), state1 (odd lanes)
	vrev32.16 d0, d4 @ Odd lanes now hold sample1 = b1 >> 16
	@ Second pass: only the odd (factor[1] / state1) lanes of the results
	@ are stored.
	vqdmull.s16 q1, d30, d0 @ a1 = (factor[1] * sample1) << 1
	vshll.s16 q0, d0, #16 @ sample1 << 16
	vqadd.s32 q2, q1, q8 @ b1 = sat(a1 + state1)
	vshrn.i32 d6, q2, #16 @ b1 >> 16
	vmull.s16 q1, d31, d6 @ a1 = -factor[1] * (b1 >> 16)
	vst1.16 d6[1], [r0] @ Store data_ch1[n + 1]
	vst1.16 d6[3], [r1] @ Store data_ch2[n + 1]
	vshl.s32 q1, #1
	vst1.32 d16[0], [r6]! @ Store filter_state_ch1[0]
	vqadd.s32 q9, q1, q0 @ Final state1 in the odd lanes of q9
	vst1.32 d17[0], [r5]! @ Store filter_state_ch2[0]
	vst1.32 d18[1], [r6] @ Store filter_state_ch1[1]
	vst1.32 d19[1], [r5] @ Store filter_state_ch2[1]

	pop {r4 - r7}
	bx lr

	@void AllpassFilter2FixDec16BothChannels(
	@ int16_t *data_ch1, // Input and output in channel 1, in Q0
	@ int16_t *data_ch2, // Input and output in channel 2, in Q0
	@ const int16_t *factor_ch1, // Scaling factor for channel 1, in Q15
	@ const int16_t *factor_ch2, // Scaling factor for channel 2, in Q15
	@ const int length, // Length of the data buffers
	@ int32_t *filter_state_ch1, // Filter state for channel 1, in Q16
	@ int32_t *filter_state_ch2) { // Filter state for channel 2, in Q16
	@ int n = 0;
	@ int32_t state0_ch1 = filter_state_ch1[0], state1_ch1 = filter_state_ch1[1];
	@ int32_t state0_ch2 = filter_state_ch2[0], state1_ch2 = filter_state_ch2[1];
	@ int16_t sample0_ch1 = 0, sample0_ch2 = 0;
	@ int16_t sample1_ch1 = 0, sample1_ch2 = 0;
	@ int32_t a0_ch1 = 0, a0_ch2 = 0;
	@ int32_t b0_ch1 = 0, b0_ch2 = 0;
	@
	@ int32_t a1_ch1 = 0, a1_ch2 = 0;
	@ int32_t b1_ch1 = 0, b1_ch2 = 0;
	@ int32_t b2_ch1 = 0, b2_ch2 = 0;
	@
	@ // Loop unrolling preprocessing.
	@
	@ sample0_ch1 = data_ch1[n];
	@ sample0_ch2 = data_ch2[n];
	@
	@ a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
	@ a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
	@
	@ b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
	@ b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
	@
	@ a0_ch1 = -factor_ch1[0] * (int16_t)(b0_ch1 >> 16);
	@ a0_ch2 = -factor_ch2[0] * (int16_t)(b0_ch2 >> 16);
	@
	@ state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1 <<1, (uint32_t)sample0_ch1 << 16);
	@ state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2 <<1, (uint32_t)sample0_ch2 << 16);
	@
	@ sample1_ch1 = data_ch1[n + 1];
	@ sample0_ch1 = (int16_t) (b0_ch1 >> 16); //Save as Q0
	@ sample1_ch2 = data_ch2[n + 1];
	@ sample0_ch2 = (int16_t) (b0_ch2 >> 16); //Save as Q0
	@
	@
	@ for (n = 0; n < length - 2; n += 2) {
	@ a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
	@ a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
	@ a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
	@ a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
	@
	@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
	@ b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1); //Q16+Q16=Q16
	@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2); //Q16+Q16=Q16
	@ b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2); //Q16+Q16=Q16
	@
	@ a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
	@ a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
	@ a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
	@ a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
	@
	@ state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 <<16);
	@ state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 <<16);
	@ state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 <<16);
	@ state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 <<16);
	@
	@ sample0_ch1 = data_ch1[n + 2];
	@ sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
	@ sample0_ch2 = data_ch2[n + 2];
	@ sample1_ch2 = (int16_t) (b1_ch2 >> 16); //Save as Q0
	@
	@ a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
	@ a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
	@ a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
	@ a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
	@
	@ b2_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
	@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
	@ b2_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
	@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
	@
	@ a0_ch1 = -factor_ch1[0] * (int16_t)(b2_ch1 >> 16);
	@ a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
	@ a0_ch2 = -factor_ch2[0] * (int16_t)(b2_ch2 >> 16);
	@ a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
	@
	@ state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1<<16);
	@ state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
	@ state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2<<16);
	@ state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
	@
	@
	@ sample1_ch1 = data_ch1[n + 3];
	@ sample0_ch1 = (int16_t) (b2_ch1 >> 16); //Save as Q0
	@ sample1_ch2 = data_ch2[n + 3];
	@ sample0_ch2 = (int16_t) (b2_ch2 >> 16); //Save as Q0
	@
	@ data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
	@ data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
	@ data_ch2[n] = (int16_t) (b0_ch2 >> 16);
	@ data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
	@ }
	@
	@ // Loop unrolling post-processing.
	@
	@ a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
	@ a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
	@ a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
	@ a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
	@
	@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
	@ b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1);
	@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2);
	@ b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2);
	@
	@ a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
	@ a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
	@ a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
	@ a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
	@
	@ state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 << 16);
	@ state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 << 16);
	@ state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 << 16);
	@ state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 << 16);
	@
	@ data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
	@ data_ch2[n] = (int16_t) (b0_ch2 >> 16);
	@
	@ sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
	@ sample1_ch2 = (int16_t) (b1_ch2 >> 16); //Save as Q0
	@
	@ a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
	@ a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
	@
	@ b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
	@ b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
	@
	@ a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
	@ a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
	@
	@ state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
	@ state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
	@
	@ data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
	@ data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
	@
	@ filter_state_ch1[0] = state0_ch1;
	@ filter_state_ch1[1] = state1_ch1;
	@ filter_state_ch2[0] = state0_ch2;
	@ filter_state_ch2[1] = state1_ch2;
	@}