// blob: a77ec195c71348b42e8420c1baab4f759170f861 [file] [log] [blame]
/*
* Copyright (C) 2012 Intel Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "platform/audio/DirectConvolver.h"
#if OS(MACOSX)
#include <Accelerate/Accelerate.h>
#endif
#include "platform/audio/VectorMath.h"
#include "wtf/CPU.h"
#if (CPU(X86) || CPU(X86_64)) && !OS(MACOSX)
#include <emmintrin.h>
#endif
namespace blink {
using namespace VectorMath;
// Creates a convolver that handles blocks of exactly |inputBlockSize| frames.
// The internal buffer is sized to hold two such blocks.
DirectConvolver::DirectConvolver(size_t inputBlockSize)
    : m_inputBlockSize(inputBlockSize),
      m_buffer(2 * inputBlockSize) {
}
// Convolves one block of input with |convolutionKernel| by direct evaluation
// and writes |framesToProcess| samples to |destP|.
//
// For every output frame i this computes
//   destP[i] = sum over j in [0, kernelSize) of input[i - j] * kernel[j]
// where input indices below zero refer to samples of the previous block, which
// are kept in the first half of |m_buffer| between calls.
//
// Preconditions (each one is ASSERTed; if it does not hold the function
// returns without writing to |destP|):
//   - |framesToProcess| equals |m_inputBlockSize|.
//   - |convolutionKernel| is non-null and holds at most |m_inputBlockSize|
//     samples.
//   - The kernel data, |sourceP|, |destP| and the internal buffer are non-null.
void DirectConvolver::process(AudioFloatArray* convolutionKernel,
                              const float* sourceP,
                              float* destP,
                              size_t framesToProcess) {
  ASSERT(framesToProcess == m_inputBlockSize);
  if (framesToProcess != m_inputBlockSize)
    return;

  // The kernel is dereferenced right below, so reject a null pointer first.
  ASSERT(convolutionKernel);
  if (!convolutionKernel)
    return;

  // Only support kernelSize <= m_inputBlockSize
  size_t kernelSize = convolutionKernel->size();
  ASSERT(kernelSize <= m_inputBlockSize);
  if (kernelSize > m_inputBlockSize)
    return;

  float* kernelP = convolutionKernel->data();

  // Sanity check
  bool isCopyGood = kernelP && sourceP && destP && m_buffer.data();
  ASSERT(isCopyGood);
  if (!isCopyGood)
    return;

  // The second half of |m_buffer| receives the current block; the first half
  // still holds the block from the previous call.
  float* inputP = m_buffer.data() + m_inputBlockSize;

  // Copy samples to 2nd half of input buffer.
  memcpy(inputP, sourceP, sizeof(float) * framesToProcess);

#if OS(MACOSX)
  // Both calls start reading |kernelSize - 1| samples before the current block
  // and walk the kernel backwards (stride -1 starting at its last sample).
#if CPU(X86)
  conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1,
       framesToProcess, kernelSize);
#else
  vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1,
            framesToProcess, kernelSize);
#endif  // CPU(X86)
#else
#if CPU(X86) || CPU(X86_64)
  // Convolution using SSE2. Currently only do this if both |kernelSize| and
  // |framesToProcess| are multiples of 4. If not, use the straightforward loop
  // below.
  if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) {
    // AudioFloatArray's are always aligned on at least a 16-byte boundary.
    // NOTE(review): this temporary is constructed on every call; if
    // constructing an AudioFloatArray allocates, consider caching it.
    AudioFloatArray kernelBuffer(4 * kernelSize);
    __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data());

    // Reverse the kernel and repeat each value across a vector
    for (size_t tap = 0; tap < kernelSize; ++tap)
      kernelReversed[tap] = _mm_set1_ps(kernelP[kernelSize - tap - 1]);

    // Oldest input sample that contributes to output frame 0.
    float* inputStartP = inputP - kernelSize + 1;

    // Do convolution with 4 inputs at a time.
    for (size_t frame = 0; frame < framesToProcess; frame += 4) {
      __m128 convolutionSum = _mm_setzero_ps();

      // |kernelSize| is a multiple of 4 so we can unroll the loop by 4,
      // manually.
      for (size_t k = 0; k < kernelSize; k += 4) {
        size_t dataOffset = frame + k;

        for (size_t m = 0; m < 4; ++m) {
          // Unaligned load: the window start moves by one float per tap.
          __m128 sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m);
          __m128 product = _mm_mul_ps(kernelReversed[k + m], sourceBlock);
          convolutionSum = _mm_add_ps(convolutionSum, product);
        }
      }
      _mm_storeu_ps(destP + frame, convolutionSum);
    }
  } else {
#endif
    // Scalar path. The helper macros below expand to fully unrolled
    // multiply-accumulate sequences for the common kernel sizes; they are
    // undefined again right after the loop so they do not leak out of this
    // function.
    //
    // |frameP - j| is plain pointer subtraction, so samples that precede the
    // current block are reached without relying on unsigned index wrap-around.
    //
    // FIXME: The macro can be further optimized to avoid pipeline stalls. One
    // possibility is to maintain 4 separate sums and change the macro to
    // CONVOLVE_FOUR_SAMPLES.
#define CONVOLVE_ONE_SAMPLE            \
  do {                                 \
    sum += *(frameP - j) * kernelP[j]; \
    j++;                               \
  } while (0)
#define CONVOLVE_4_SAMPLES \
  do {                     \
    CONVOLVE_ONE_SAMPLE;   \
    CONVOLVE_ONE_SAMPLE;   \
    CONVOLVE_ONE_SAMPLE;   \
    CONVOLVE_ONE_SAMPLE;   \
  } while (0)
#define CONVOLVE_16_SAMPLES \
  do {                      \
    CONVOLVE_4_SAMPLES;     \
    CONVOLVE_4_SAMPLES;     \
    CONVOLVE_4_SAMPLES;     \
    CONVOLVE_4_SAMPLES;     \
  } while (0)
#define CONVOLVE_32_SAMPLES \
  do {                      \
    CONVOLVE_16_SAMPLES;    \
    CONVOLVE_16_SAMPLES;    \
  } while (0)

    for (size_t i = 0; i < framesToProcess; ++i) {
      // Newest input sample contributing to output frame |i|.
      const float* frameP = inputP + i;
      size_t j = 0;
      float sum = 0;

      // FIXME: SSE optimization may be applied here.
      switch (kernelSize) {
        case 32:
          CONVOLVE_32_SAMPLES;
          break;
        case 64:
          CONVOLVE_32_SAMPLES;
          CONVOLVE_32_SAMPLES;
          break;
        case 128:
          CONVOLVE_32_SAMPLES;
          CONVOLVE_32_SAMPLES;
          CONVOLVE_32_SAMPLES;
          CONVOLVE_32_SAMPLES;
          break;
        default:
          // Non-optimized using actual while loop.
          while (j < kernelSize) {
            CONVOLVE_ONE_SAMPLE;
          }
          break;
      }
      destP[i] = sum;
    }

#undef CONVOLVE_32_SAMPLES
#undef CONVOLVE_16_SAMPLES
#undef CONVOLVE_4_SAMPLES
#undef CONVOLVE_ONE_SAMPLE
#if CPU(X86) || CPU(X86_64)
  }
#endif
#endif  // OS(MACOSX)

  // Copy 2nd half of input buffer to 1st half.
  memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess);
}
void DirectConvolver::reset() {
m_buffer.zero();
}
} // namespace blink