skia/ext/convolver_neon.cc - chromium/src - Git at Google

 /*
  * Copyright 2016 Google Inc.
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */

 #include "skia/ext/convolver_neon.h"

 #include <arm_neon.h>

 namespace skia {

 static SK_ALWAYS_INLINE int32x4_t
 AccumRemainder(const unsigned char* pixels_left,
                const ConvolutionFilter1D::Fixed* filter_values,
                int r) {
   int remainder[4] = {0, 0, 0, 0};
   for (int i = 0; i < r; i++) {
     ConvolutionFilter1D::Fixed coeff = filter_values[i];
     remainder[0] += coeff * pixels_left[i * 4 + 0];
     remainder[1] += coeff * pixels_left[i * 4 + 1];
     remainder[2] += coeff * pixels_left[i * 4 + 2];
     remainder[3] += coeff * pixels_left[i * 4 + 3];
   }
   return vld1q_s32(remainder);
 }

 // Convolves horizontally along a single row. The row data is given in
 // |src_data| and continues for the num_values() of the filter.
 void ConvolveHorizontally_Neon(const unsigned char* src_data,
                                const ConvolutionFilter1D& filter,
                                unsigned char* out_row,
                                bool /*has_alpha*/) {
   // Loop over each pixel on this row in the output image.
   int num_values = filter.num_values();
   for (int out_x = 0; out_x < num_values; out_x++) {
     // Get the filter that determines the current output pixel.
     int filter_offset, filter_length;
     const ConvolutionFilter1D::Fixed* filter_values =
         filter.FilterForValue(out_x, &filter_offset, &filter_length);

     // Compute the first pixel in this row that the filter affects. It will
     // touch |filter_length| pixels (4 bytes each) after this.
     const unsigned char* row_to_filter = &src_data[filter_offset * 4];

     // Apply the filter to the row to get the destination pixel in |accum|.
     int32x4_t accum = vdupq_n_s32(0);
     for (int filter_x = 0; filter_x < (filter_length / 4); filter_x++) {
       // Load 4 coefficients.
       int16x4_t coeffs = vld1_s16(filter_values);
       // Load 4 pixels into a q-register.
       uint8x16_t pixels = vld1q_u8(row_to_filter);

       // Expand to 16-bit channels split across two q-registers.
       int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
       int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

       // Scale each pixel (each d-register) by its filter coefficients,
       // accumulating into 32-bit.
       accum = vmlal_lane_s16(accum, vget_low_s16(p01_16), coeffs, 0);
       accum = vmlal_lane_s16(accum, vget_high_s16(p01_16), coeffs, 1);
       accum = vmlal_lane_s16(accum, vget_low_s16(p23_16), coeffs, 2);
       accum = vmlal_lane_s16(accum, vget_high_s16(p23_16), coeffs, 3);

       // Advance to next elements.
       row_to_filter += 16;
       filter_values += 4;
     }

     int remainder = filter_length & 3;
     if (remainder) {
       int remainder_offset = (filter_offset + filter_length - remainder) * 4;
       accum +=
           AccumRemainder(src_data + remainder_offset, filter_values, remainder);
     }

     // Bring this value back in range. All of the filter scaling factors
     // are in fixed point with kShiftBits bits of fractional part.
     int16x4_t accum16 = vqshrn_n_s32(accum, ConvolutionFilter1D::kShiftBits);

     // Pack and store the new pixel.
     uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
     vst1_lane_u32(reinterpret_cast<uint32_t*>(out_row),
                   vreinterpret_u32_u8(accum8), 0);
     out_row += 4;
   }
 }

 // Convolves horizontally along four rows. The row data is given in
 // |src_data| and continues for the num_values() of the filter.
 // The algorithm is almost same as |convolve_horizontally|. Please
 // refer to that function for detailed comments.
 void Convolve4RowsHorizontally_Neon(const unsigned char* src_data[4],
                                     const ConvolutionFilter1D& filter,
                                     unsigned char* out_row[4]) {
   // Output one pixel each iteration, calculating all channels (RGBA) together.
   int num_values = filter.num_values();
   for (int out_x = 0; out_x < num_values; out_x++) {
     int filter_offset, filter_length;
     const ConvolutionFilter1D::Fixed* filter_values =
         filter.FilterForValue(out_x, &filter_offset, &filter_length);

     // Four pixels in a column per iteration.
     int32x4_t accum0 = vdupq_n_s32(0);
     int32x4_t accum1 = vdupq_n_s32(0);
     int32x4_t accum2 = vdupq_n_s32(0);
     int32x4_t accum3 = vdupq_n_s32(0);

     int start = filter_offset * 4;

     // Load and accumulate with four coefficients per iteration.
     for (int filter_x = 0; filter_x < (filter_length / 4); filter_x++) {
       // Load 4 coefficients.
       int16x4_t coeffs = vld1_s16(filter_values);

       auto iteration = [=](const uint8_t* src) {
         // c.f. ConvolveHorizontally_Neon() above.
         uint8x16_t pixels = vld1q_u8(src);
         int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
         int16x8_t p23_16 =
             vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
         int32x4_t accum = vdupq_n_s32(0);
         accum = vmlal_lane_s16(accum, vget_low_s16(p01_16), coeffs, 0);
         accum = vmlal_lane_s16(accum, vget_high_s16(p01_16), coeffs, 1);
         accum = vmlal_lane_s16(accum, vget_low_s16(p23_16), coeffs, 2);
         accum = vmlal_lane_s16(accum, vget_high_s16(p23_16), coeffs, 3);
         return accum;
       };

       accum0 += iteration(src_data[0] + start);
       accum1 += iteration(src_data[1] + start);
       accum2 += iteration(src_data[2] + start);
       accum3 += iteration(src_data[3] + start);

       start += 16;
       filter_values += 4;
     }

     int remainder = filter_length & 3;
     if (remainder) {
       int remainder_offset = (filter_offset + filter_length - remainder) * 4;
       accum0 += AccumRemainder(src_data[0] + remainder_offset, filter_values,
                                remainder);
       accum1 += AccumRemainder(src_data[1] + remainder_offset, filter_values,
                                remainder);
       accum2 += AccumRemainder(src_data[2] + remainder_offset, filter_values,
                                remainder);
       accum3 += AccumRemainder(src_data[3] + remainder_offset, filter_values,
                                remainder);
     }

     auto pack_result = [](int32x4_t accum) {
       int16x4_t accum16 = vqshrn_n_s32(accum, ConvolutionFilter1D::kShiftBits);
       return vqmovun_s16(vcombine_s16(accum16, accum16));
     };

     uint8x8_t res0 = pack_result(accum0);
     uint8x8_t res1 = pack_result(accum1);
     uint8x8_t res2 = pack_result(accum2);
     uint8x8_t res3 = pack_result(accum3);

     vst1_lane_u32(reinterpret_cast<uint32_t*>(out_row[0]),
                   vreinterpret_u32_u8(res0), 0);
     vst1_lane_u32(reinterpret_cast<uint32_t*>(out_row[1]),
                   vreinterpret_u32_u8(res1), 0);
     vst1_lane_u32(reinterpret_cast<uint32_t*>(out_row[2]),
                   vreinterpret_u32_u8(res2), 0);
     vst1_lane_u32(reinterpret_cast<uint32_t*>(out_row[3]),
                   vreinterpret_u32_u8(res3), 0);
     out_row[0] += 4;
     out_row[1] += 4;
     out_row[2] += 4;
     out_row[3] += 4;
   }
 }

 // Does vertical convolution to produce one output row. The filter values and
 // length are given in the first two parameters. These are applied to each
 // of the rows pointed to in the |source_data_rows| array, with each row
 // being |pixel_width| wide.
 //
 // The output must have room for |pixel_width * 4| bytes.
 void ConvolveVertically_Neon(const ConvolutionFilter1D::Fixed* filter_values,
                              int filter_length,
                              unsigned char* const* source_data_rows,
                              int pixel_width,
                              unsigned char* out_row,
                              bool has_alpha) {
   int width = pixel_width & ~3;

   // Output four pixels per iteration (16 bytes).
   for (int out_x = 0; out_x < width; out_x += 4) {
     // Accumulated result for each pixel. 32 bits per RGBA channel.
     int32x4_t accum0 = vdupq_n_s32(0);
     int32x4_t accum1 = vdupq_n_s32(0);
     int32x4_t accum2 = vdupq_n_s32(0);
     int32x4_t accum3 = vdupq_n_s32(0);

     // Convolve with one filter coefficient per iteration.
     for (int filter_y = 0; filter_y < filter_length; filter_y++) {
       // Load four pixels (16 bytes) together.
       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
       uint8x16_t src8 = vld1q_u8(&source_data_rows[filter_y][out_x << 2]);

       int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
       int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

       accum0 =
           vmlal_n_s16(accum0, vget_low_s16(src16_01), filter_values[filter_y]);
       accum1 =
           vmlal_n_s16(accum1, vget_high_s16(src16_01), filter_values[filter_y]);
       accum2 =
           vmlal_n_s16(accum2, vget_low_s16(src16_23), filter_values[filter_y]);
       accum3 =
           vmlal_n_s16(accum3, vget_high_s16(src16_23), filter_values[filter_y]);
     }

     // Shift right for fixed point implementation.
     // Packing 32 bits |accum| to 16 bits per channel (unsigned saturation).
     int16x4_t accum16_0 = vqshrn_n_s32(accum0, ConvolutionFilter1D::kShiftBits);
     int16x4_t accum16_1 = vqshrn_n_s32(accum1, ConvolutionFilter1D::kShiftBits);
     int16x4_t accum16_2 = vqshrn_n_s32(accum2, ConvolutionFilter1D::kShiftBits);
     int16x4_t accum16_3 = vqshrn_n_s32(accum3, ConvolutionFilter1D::kShiftBits);

     // [16] a1 b1 g1 r1 a0 b0 g0 r0
     int16x8_t accum16_low = vcombine_s16(accum16_0, accum16_1);
     // [16] a3 b3 g3 r3 a2 b2 g2 r2
     int16x8_t accum16_high = vcombine_s16(accum16_2, accum16_3);

     // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
     // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
     uint8x16_t accum8 =
         vcombine_u8(vqmovun_s16(accum16_low), vqmovun_s16(accum16_high));

     if (has_alpha) {
       // Compute the max(ri, gi, bi) for each pixel.
       // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
       uint8x16_t a =
           vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
       uint8x16_t b = vmaxq_u8(a, accum8);  // Max of r and g
       // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
       a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
       b = vmaxq_u8(a, b);  // Max of r and g and b.
       // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
       b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));

       // Make sure the value of alpha channel is always larger than maximum
       // value of color channels.
       accum8 = vmaxq_u8(b, accum8);
     } else {
       // Set value of alpha channels to 0xFF.
       accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) |
                                     vdupq_n_u32(0xFF000000));
     }

     // Store the convolution result (16 bytes) and advance the pixel pointers.
     vst1q_u8(out_row, accum8);
     out_row += 16;
   }

   // Process the leftovers when the width of the output is not divisible
   // by 4, that is at most 3 pixels.
   int remainder = pixel_width & 3;
   if (remainder) {
     int32x4_t accum0 = vdupq_n_s32(0);
     int32x4_t accum1 = vdupq_n_s32(0);
     int32x4_t accum2 = vdupq_n_s32(0);

     for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
       uint8x16_t src8 = vld1q_u8(&source_data_rows[filter_y][width * 4]);

       int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
       int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

       accum0 =
           vmlal_n_s16(accum0, vget_low_s16(src16_01), filter_values[filter_y]);
       accum1 =
           vmlal_n_s16(accum1, vget_high_s16(src16_01), filter_values[filter_y]);
       accum2 =
           vmlal_n_s16(accum2, vget_low_s16(src16_23), filter_values[filter_y]);
     }

     int16x4_t accum16_0 = vqshrn_n_s32(accum0, ConvolutionFilter1D::kShiftBits);
     int16x4_t accum16_1 = vqshrn_n_s32(accum1, ConvolutionFilter1D::kShiftBits);
     int16x4_t accum16_2 = vqshrn_n_s32(accum2, ConvolutionFilter1D::kShiftBits);

     int16x8_t accum16_low = vcombine_s16(accum16_0, accum16_1);
     int16x8_t accum16_high = vcombine_s16(accum16_2, accum16_2);

     uint8x16_t accum8 =
         vcombine_u8(vqmovun_s16(accum16_low), vqmovun_s16(accum16_high));

     if (has_alpha) {
       // Compute the max(ri, gi, bi) for each pixel.
       // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
       uint8x16_t a =
           vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
       uint8x16_t b = vmaxq_u8(a, accum8);  // Max of r and g
       // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
       a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
       b = vmaxq_u8(a, b);  // Max of r and g and b.
       // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
       b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));

       // Make sure the value of alpha channel is always larger than maximum
       // value of color channels.
       accum8 = vmaxq_u8(b, accum8);
     } else {
       // Set value of alpha channels to 0xFF.
       accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) |
                                     vdupq_n_u32(0xFF000000));
     }

     switch (remainder) {
       case 1:
         vst1q_lane_u32(reinterpret_cast<uint32_t*>(out_row),
                        vreinterpretq_u32_u8(accum8), 0);
         break;
       case 2:
         vst1_u32(reinterpret_cast<uint32_t*>(out_row),
                  vreinterpret_u32_u8(vget_low_u8(accum8)));
         break;
       case 3:
         vst1_u32(reinterpret_cast<uint32_t*>(out_row),
                  vreinterpret_u32_u8(vget_low_u8(accum8)));
         vst1q_lane_u32(reinterpret_cast<uint32_t*>(out_row + 8),
                        vreinterpretq_u32_u8(accum8), 2);
         break;
     }
   }
 }

 }  // namespace skia
	/*
	* Copyright 2016 Google Inc.
	*
	* Use of this source code is governed by a BSD-style license that can be
	* found in the LICENSE file.
	*/

	#include "skia/ext/convolver_neon.h"

	#include <arm_neon.h>

	namespace skia {

	static SK_ALWAYS_INLINE int32x4_t
	AccumRemainder(const unsigned char* pixels_left,
	const ConvolutionFilter1D::Fixed* filter_values,
	int r) {
	int remainder[4] = {0, 0, 0, 0};
	for (int i = 0; i < r; i++) {
	ConvolutionFilter1D::Fixed coeff = filter_values[i];
	remainder[0] += coeff * pixels_left[i * 4 + 0];
	remainder[1] += coeff * pixels_left[i * 4 + 1];
	remainder[2] += coeff * pixels_left[i * 4 + 2];
	remainder[3] += coeff * pixels_left[i * 4 + 3];
	}
	return vld1q_s32(remainder);
	}

	// Convolves horizontally along a single row. The row data is given in
	// \|src_data\| and continues for the num_values() of the filter.
	void ConvolveHorizontally_Neon(const unsigned char* src_data,
	const ConvolutionFilter1D& filter,
	unsigned char* out_row,
	bool /has_alpha/) {
	// Loop over each pixel on this row in the output image.
	int num_values = filter.num_values();
	for (int out_x = 0; out_x < num_values; out_x++) {
	// Get the filter that determines the current output pixel.
	int filter_offset, filter_length;
	const ConvolutionFilter1D::Fixed* filter_values =
	filter.FilterForValue(out_x, &filter_offset, &filter_length);

	// Compute the first pixel in this row that the filter affects. It will
	// touch \|filter_length\| pixels (4 bytes each) after this.
	const unsigned char* row_to_filter = &src_data[filter_offset * 4];

	// Apply the filter to the row to get the destination pixel in \|accum\|.
	int32x4_t accum = vdupq_n_s32(0);
	for (int filter_x = 0; filter_x < (filter_length / 4); filter_x++) {
	// Load 4 coefficients.
	int16x4_t coeffs = vld1_s16(filter_values);
	// Load 4 pixels into a q-register.
	uint8x16_t pixels = vld1q_u8(row_to_filter);

	// Expand to 16-bit channels split across two q-registers.
	int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
	int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

	// Scale each pixel (each d-register) by its filter coefficients,
	// accumulating into 32-bit.
	accum = vmlal_lane_s16(accum, vget_low_s16(p01_16), coeffs, 0);
	accum = vmlal_lane_s16(accum, vget_high_s16(p01_16), coeffs, 1);
	accum = vmlal_lane_s16(accum, vget_low_s16(p23_16), coeffs, 2);
	accum = vmlal_lane_s16(accum, vget_high_s16(p23_16), coeffs, 3);

	// Advance to next elements.
	row_to_filter += 16;
	filter_values += 4;
	}

	int remainder = filter_length & 3;
	if (remainder) {
	int remainder_offset = (filter_offset + filter_length - remainder) * 4;
	accum +=
	AccumRemainder(src_data + remainder_offset, filter_values, remainder);
	}

	// Bring this value back in range. All of the filter scaling factors
	// are in fixed point with kShiftBits bits of fractional part.
	int16x4_t accum16 = vqshrn_n_s32(accum, ConvolutionFilter1D::kShiftBits);

	// Pack and store the new pixel.
	uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
	vst1_lane_u32(reinterpret_cast<uint32_t*>(out_row),
	vreinterpret_u32_u8(accum8), 0);
	out_row += 4;
	}
	}

	// Convolves horizontally along four rows. The row data is given in
	// \|src_data\| and continues for the num_values() of the filter.
	// The algorithm is almost same as \|convolve_horizontally\|. Please
	// refer to that function for detailed comments.
	void Convolve4RowsHorizontally_Neon(const unsigned char* src_data[4],
	const ConvolutionFilter1D& filter,
	unsigned char* out_row[4]) {
	// Output one pixel each iteration, calculating all channels (RGBA) together.
	int num_values = filter.num_values();
	for (int out_x = 0; out_x < num_values; out_x++) {
	int filter_offset, filter_length;
	const ConvolutionFilter1D::Fixed* filter_values =
	filter.FilterForValue(out_x, &filter_offset, &filter_length);

	// Four pixels in a column per iteration.
	int32x4_t accum0 = vdupq_n_s32(0);
	int32x4_t accum1 = vdupq_n_s32(0);
	int32x4_t accum2 = vdupq_n_s32(0);
	int32x4_t accum3 = vdupq_n_s32(0);

	int start = filter_offset * 4;

	// Load and accumulate with four coefficients per iteration.
	for (int filter_x = 0; filter_x < (filter_length / 4); filter_x++) {
	// Load 4 coefficients.
	int16x4_t coeffs = vld1_s16(filter_values);

	auto iteration = [=](const uint8_t* src) {
	// c.f. ConvolveHorizontally_Neon() above.
	uint8x16_t pixels = vld1q_u8(src);
	int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
	int16x8_t p23_16 =
	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
	int32x4_t accum = vdupq_n_s32(0);
	accum = vmlal_lane_s16(accum, vget_low_s16(p01_16), coeffs, 0);
	accum = vmlal_lane_s16(accum, vget_high_s16(p01_16), coeffs, 1);
	accum = vmlal_lane_s16(accum, vget_low_s16(p23_16), coeffs, 2);
	accum = vmlal_lane_s16(accum, vget_high_s16(p23_16), coeffs, 3);
	return accum;
	};

	accum0 += iteration(src_data[0] + start);
	accum1 += iteration(src_data[1] + start);
	accum2 += iteration(src_data[2] + start);
	accum3 += iteration(src_data[3] + start);

	start += 16;
	filter_values += 4;
	}

	int remainder = filter_length & 3;
	if (remainder) {
	int remainder_offset = (filter_offset + filter_length - remainder) * 4;
	accum0 += AccumRemainder(src_data[0] + remainder_offset, filter_values,
	remainder);
	accum1 += AccumRemainder(src_data[1] + remainder_offset, filter_values,
	remainder);
	accum2 += AccumRemainder(src_data[2] + remainder_offset, filter_values,
	remainder);
	accum3 += AccumRemainder(src_data[3] + remainder_offset, filter_values,
	remainder);
	}

	auto pack_result = [](int32x4_t accum) {
	int16x4_t accum16 = vqshrn_n_s32(accum, ConvolutionFilter1D::kShiftBits);
	return vqmovun_s16(vcombine_s16(accum16, accum16));
	};

	uint8x8_t res0 = pack_result(accum0);
	uint8x8_t res1 = pack_result(accum1);
	uint8x8_t res2 = pack_result(accum2);
	uint8x8_t res3 = pack_result(accum3);

	vst1_lane_u32(reinterpret_cast<uint32_t*>(out_row[0]),
	vreinterpret_u32_u8(res0), 0);
	vst1_lane_u32(reinterpret_cast<uint32_t*>(out_row[1]),
	vreinterpret_u32_u8(res1), 0);
	vst1_lane_u32(reinterpret_cast<uint32_t*>(out_row[2]),
	vreinterpret_u32_u8(res2), 0);
	vst1_lane_u32(reinterpret_cast<uint32_t*>(out_row[3]),
	vreinterpret_u32_u8(res3), 0);
	out_row[0] += 4;
	out_row[1] += 4;
	out_row[2] += 4;
	out_row[3] += 4;
	}
	}

	// Does vertical convolution to produce one output row. The filter values and
	// length are given in the first two parameters. These are applied to each
	// of the rows pointed to in the \|source_data_rows\| array, with each row
	// being \|pixel_width\| wide.
	//
	// The output must have room for \|pixel_width * 4\| bytes.
	void ConvolveVertically_Neon(const ConvolutionFilter1D::Fixed* filter_values,
	int filter_length,
	unsigned char* const* source_data_rows,
	int pixel_width,
	unsigned char* out_row,
	bool has_alpha) {
	int width = pixel_width & ~3;

	// Output four pixels per iteration (16 bytes).
	for (int out_x = 0; out_x < width; out_x += 4) {
	// Accumulated result for each pixel. 32 bits per RGBA channel.
	int32x4_t accum0 = vdupq_n_s32(0);
	int32x4_t accum1 = vdupq_n_s32(0);
	int32x4_t accum2 = vdupq_n_s32(0);
	int32x4_t accum3 = vdupq_n_s32(0);

	// Convolve with one filter coefficient per iteration.
	for (int filter_y = 0; filter_y < filter_length; filter_y++) {
	// Load four pixels (16 bytes) together.
	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
	uint8x16_t src8 = vld1q_u8(&source_data_rows[filter_y][out_x << 2]);

	int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
	int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

	accum0 =
	vmlal_n_s16(accum0, vget_low_s16(src16_01), filter_values[filter_y]);
	accum1 =
	vmlal_n_s16(accum1, vget_high_s16(src16_01), filter_values[filter_y]);
	accum2 =
	vmlal_n_s16(accum2, vget_low_s16(src16_23), filter_values[filter_y]);
	accum3 =
	vmlal_n_s16(accum3, vget_high_s16(src16_23), filter_values[filter_y]);
	}

	// Shift right for fixed point implementation.
	// Packing 32 bits \|accum\| to 16 bits per channel (unsigned saturation).
	int16x4_t accum16_0 = vqshrn_n_s32(accum0, ConvolutionFilter1D::kShiftBits);
	int16x4_t accum16_1 = vqshrn_n_s32(accum1, ConvolutionFilter1D::kShiftBits);
	int16x4_t accum16_2 = vqshrn_n_s32(accum2, ConvolutionFilter1D::kShiftBits);
	int16x4_t accum16_3 = vqshrn_n_s32(accum3, ConvolutionFilter1D::kShiftBits);

	// [16] a1 b1 g1 r1 a0 b0 g0 r0
	int16x8_t accum16_low = vcombine_s16(accum16_0, accum16_1);
	// [16] a3 b3 g3 r3 a2 b2 g2 r2
	int16x8_t accum16_high = vcombine_s16(accum16_2, accum16_3);

	// Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).
	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
	uint8x16_t accum8 =
	vcombine_u8(vqmovun_s16(accum16_low), vqmovun_s16(accum16_high));

	if (has_alpha) {
	// Compute the max(ri, gi, bi) for each pixel.
	// [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
	uint8x16_t a =
	vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
	// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
	uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g
	// [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
	a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
	// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
	b = vmaxq_u8(a, b); // Max of r and g and b.
	// [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
	b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));

	// Make sure the value of alpha channel is always larger than maximum
	// value of color channels.
	accum8 = vmaxq_u8(b, accum8);
	} else {
	// Set value of alpha channels to 0xFF.
	accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) \|
	vdupq_n_u32(0xFF000000));
	}

	// Store the convolution result (16 bytes) and advance the pixel pointers.
	vst1q_u8(out_row, accum8);
	out_row += 16;
	}

	// Process the leftovers when the width of the output is not divisible
	// by 4, that is at most 3 pixels.
	int remainder = pixel_width & 3;
	if (remainder) {
	int32x4_t accum0 = vdupq_n_s32(0);
	int32x4_t accum1 = vdupq_n_s32(0);
	int32x4_t accum2 = vdupq_n_s32(0);

	for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
	uint8x16_t src8 = vld1q_u8(&source_data_rows[filter_y][width * 4]);

	int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
	int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

	accum0 =
	vmlal_n_s16(accum0, vget_low_s16(src16_01), filter_values[filter_y]);
	accum1 =
	vmlal_n_s16(accum1, vget_high_s16(src16_01), filter_values[filter_y]);
	accum2 =
	vmlal_n_s16(accum2, vget_low_s16(src16_23), filter_values[filter_y]);
	}

	int16x4_t accum16_0 = vqshrn_n_s32(accum0, ConvolutionFilter1D::kShiftBits);
	int16x4_t accum16_1 = vqshrn_n_s32(accum1, ConvolutionFilter1D::kShiftBits);
	int16x4_t accum16_2 = vqshrn_n_s32(accum2, ConvolutionFilter1D::kShiftBits);

	int16x8_t accum16_low = vcombine_s16(accum16_0, accum16_1);
	int16x8_t accum16_high = vcombine_s16(accum16_2, accum16_2);

	uint8x16_t accum8 =
	vcombine_u8(vqmovun_s16(accum16_low), vqmovun_s16(accum16_high));

	if (has_alpha) {
	// Compute the max(ri, gi, bi) for each pixel.
	// [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
	uint8x16_t a =
	vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
	// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
	uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g
	// [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
	a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
	// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
	b = vmaxq_u8(a, b); // Max of r and g and b.
	// [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
	b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));

	// Make sure the value of alpha channel is always larger than maximum
	// value of color channels.
	accum8 = vmaxq_u8(b, accum8);
	} else {
	// Set value of alpha channels to 0xFF.
	accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) \|
	vdupq_n_u32(0xFF000000));
	}

	switch (remainder) {
	case 1:
	vst1q_lane_u32(reinterpret_cast<uint32_t*>(out_row),
	vreinterpretq_u32_u8(accum8), 0);
	break;
	case 2:
	vst1_u32(reinterpret_cast<uint32_t*>(out_row),
	vreinterpret_u32_u8(vget_low_u8(accum8)));
	break;
	case 3:
	vst1_u32(reinterpret_cast<uint32_t*>(out_row),
	vreinterpret_u32_u8(vget_low_u8(accum8)));
	vst1q_lane_u32(reinterpret_cast<uint32_t*>(out_row + 8),
	vreinterpretq_u32_u8(accum8), 2);
	break;
	}
	}
	}

	} // namespace skia