You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
4107 lines
127 KiB
4107 lines
127 KiB
// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#ifndef GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_
|
|
#define GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_
|
|
|
|
#ifdef GEMMLOWP_NEON_64
|
|
|
|
#include <cassert>
|
|
#include <cstdint>
|
|
|
|
namespace gemmlowp {
|
|
namespace meta {
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 1,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 1, 1, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v1.2s}, [%x[lhs]], #8\n"
|
|
"ld1 {v2.2s}, [%x[rhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v3.8h, v2.8b, v1.8b\n"
|
|
"uadalp v0.4s, v3.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[multiplicative_offset]\n"
|
|
"dup v7.4s, %w[rounding_offset]\n"
|
|
"dup v8.4s, %w[shift]\n"
|
|
"dup v4.4s, v4.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"mul v0.4s, v0.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"sshl v0.4s, v0.4s, v8.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.b}[0], [%x[result]], #1\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 2,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 1, 2, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v2.2s}, [%x[lhs]], #8\n"
|
|
"ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v5.8h, v3.8b, v2.8b\n"
|
|
"umull v6.8h, v4.8b, v2.8b\n"
|
|
"uadalp v0.4s, v5.8h\n"
|
|
"uadalp v1.4s, v6.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[multiplicative_offset]\n"
|
|
"dup v7.4s, %w[rounding_offset]\n"
|
|
"dup v8.4s, %w[shift]\n"
|
|
"dup v4.4s, v4.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"mul v0.4s, v0.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"sshl v0.4s, v0.4s, v8.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.h}[0], [%x[result]], #2\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 3,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 1, 3, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v3.2s}, [%x[lhs]], #8\n"
|
|
"ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v7.8h, v4.8b, v3.8b\n"
|
|
"umull v8.8h, v5.8b, v3.8b\n"
|
|
"umull v9.8h, v6.8b, v3.8b\n"
|
|
"uadalp v0.4s, v7.8h\n"
|
|
"uadalp v1.4s, v8.8h\n"
|
|
"uadalp v2.4s, v9.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[multiplicative_offset]\n"
|
|
"dup v7.4s, %w[rounding_offset]\n"
|
|
"dup v8.4s, %w[shift]\n"
|
|
"dup v4.4s, v4.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"mul v0.4s, v0.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"sshl v0.4s, v0.4s, v8.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.h}[0], [%x[result]], #2\n"
|
|
"st1 {v0.b}[2], [%x[result]], #1\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc",
|
|
"memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 4,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 1, 4, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v4.2s}, [%x[lhs]], #8\n"
|
|
"ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v9.8h, v5.8b, v4.8b\n"
|
|
"umull v10.8h, v6.8b, v4.8b\n"
|
|
"umull v11.8h, v7.8b, v4.8b\n"
|
|
"umull v12.8h, v8.8b, v4.8b\n"
|
|
"uadalp v0.4s, v9.8h\n"
|
|
"uadalp v1.4s, v10.8h\n"
|
|
"uadalp v2.4s, v11.8h\n"
|
|
"uadalp v3.4s, v12.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[multiplicative_offset]\n"
|
|
"dup v7.4s, %w[rounding_offset]\n"
|
|
"dup v8.4s, %w[shift]\n"
|
|
"dup v4.4s, v4.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"mul v0.4s, v0.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"sshl v0.4s, v0.4s, v8.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.s}[0], [%x[result]], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 5,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 1, 5, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
|
|
// General 1xM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
|
|
"ld1 {v9.2s}, [%x[lhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v10.8h, v5.8b, v9.8b\n"
|
|
"umull v11.8h, v6.8b, v9.8b\n"
|
|
"umull v12.8h, v7.8b, v9.8b\n"
|
|
"umull v13.8h, v8.8b, v9.8b\n"
|
|
"ld1 {v5.2s}, [%x[rhs]], #8\n"
|
|
"prfm pldl1keep, [%x[rhs], #128]\n"
|
|
"uadalp v0.4s, v10.8h\n"
|
|
"uadalp v1.4s, v11.8h\n"
|
|
"uadalp v2.4s, v12.8h\n"
|
|
"uadalp v3.4s, v13.8h\n"
|
|
"umull v10.8h, v5.8b, v9.8b\n"
|
|
"uadalp v4.4s, v10.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v5.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n"
|
|
"dup v8.4s, %w[multiplicative_offset]\n"
|
|
"dup v9.4s, %w[rounding_offset]\n"
|
|
"dup v10.4s, %w[shift]\n"
|
|
"dup v5.4s, v5.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v4.4s, v4.4s, v4.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v1.4s, v4.4s, v4.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"add v1.4s, v1.4s, v5.4s\n"
|
|
"add v0.4s, v0.4s, v6.4s\n"
|
|
"add v1.4s, v1.4s, v7.4s\n"
|
|
"mul v0.4s, v0.4s, v8.4s\n"
|
|
"mul v1.4s, v1.4s, v8.4s\n"
|
|
"add v0.4s, v0.4s, v9.4s\n"
|
|
"add v1.4s, v1.4s, v9.4s\n"
|
|
"sshl v0.4s, v0.4s, v10.4s\n"
|
|
"sshl v1.4s, v1.4s, v10.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.s}[0], [%x[result]], #4\n"
|
|
"st1 {v0.b}[4], [%x[result]], #1\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 6,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 1, 6, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
|
|
// General 1xM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n"
|
|
"ld1 {v10.2s}, [%x[lhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v11.8h, v6.8b, v10.8b\n"
|
|
"umull v12.8h, v7.8b, v10.8b\n"
|
|
"umull v13.8h, v8.8b, v10.8b\n"
|
|
"umull v14.8h, v9.8b, v10.8b\n"
|
|
"ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
|
|
"prfm pldl1keep, [%x[rhs], #128]\n"
|
|
"uadalp v0.4s, v11.8h\n"
|
|
"uadalp v1.4s, v12.8h\n"
|
|
"uadalp v2.4s, v13.8h\n"
|
|
"uadalp v3.4s, v14.8h\n"
|
|
"umull v11.8h, v6.8b, v10.8b\n"
|
|
"umull v12.8h, v7.8b, v10.8b\n"
|
|
"uadalp v4.4s, v11.8h\n"
|
|
"uadalp v5.4s, v12.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v6.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n"
|
|
"dup v9.4s, %w[multiplicative_offset]\n"
|
|
"dup v10.4s, %w[rounding_offset]\n"
|
|
"dup v11.4s, %w[shift]\n"
|
|
"dup v6.4s, v6.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v1.4s, v4.4s, v4.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v6.4s\n"
|
|
"add v1.4s, v1.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v1.4s, v1.4s, v8.4s\n"
|
|
"mul v0.4s, v0.4s, v9.4s\n"
|
|
"mul v1.4s, v1.4s, v9.4s\n"
|
|
"add v0.4s, v0.4s, v10.4s\n"
|
|
"add v1.4s, v1.4s, v10.4s\n"
|
|
"sshl v0.4s, v0.4s, v11.4s\n"
|
|
"sshl v1.4s, v1.4s, v11.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.s}[0], [%x[result]], #4\n"
|
|
"st1 {v0.h}[2], [%x[result]], #2\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 7,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 1, 7, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
"mov v6.16b, v3.16b\n"
|
|
|
|
// General 1xM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n"
|
|
"ld1 {v11.2s}, [%x[lhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v12.8h, v7.8b, v11.8b\n"
|
|
"umull v13.8h, v8.8b, v11.8b\n"
|
|
"umull v14.8h, v9.8b, v11.8b\n"
|
|
"umull v15.8h, v10.8b, v11.8b\n"
|
|
"ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n"
|
|
"prfm pldl1keep, [%x[rhs], #128]\n"
|
|
"uadalp v0.4s, v12.8h\n"
|
|
"uadalp v1.4s, v13.8h\n"
|
|
"uadalp v2.4s, v14.8h\n"
|
|
"uadalp v3.4s, v15.8h\n"
|
|
"umull v12.8h, v7.8b, v11.8b\n"
|
|
"umull v13.8h, v8.8b, v11.8b\n"
|
|
"umull v14.8h, v9.8b, v11.8b\n"
|
|
"uadalp v4.4s, v12.8h\n"
|
|
"uadalp v5.4s, v13.8h\n"
|
|
"uadalp v6.4s, v14.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v7.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n"
|
|
"dup v10.4s, %w[multiplicative_offset]\n"
|
|
"dup v11.4s, %w[rounding_offset]\n"
|
|
"dup v12.4s, %w[shift]\n"
|
|
"dup v7.4s, v7.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v6.4s, v6.4s, v6.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v1.4s, v4.4s, v6.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v1.4s, v1.4s, v7.4s\n"
|
|
"add v0.4s, v0.4s, v8.4s\n"
|
|
"add v1.4s, v1.4s, v9.4s\n"
|
|
"mul v0.4s, v0.4s, v10.4s\n"
|
|
"mul v1.4s, v1.4s, v10.4s\n"
|
|
"add v0.4s, v0.4s, v11.4s\n"
|
|
"add v1.4s, v1.4s, v11.4s\n"
|
|
"sshl v0.4s, v0.4s, v12.4s\n"
|
|
"sshl v1.4s, v1.4s, v12.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.s}[0], [%x[result]], #4\n"
|
|
"st1 {v0.h}[2], [%x[result]], #2\n"
|
|
"st1 {v0.b}[6], [%x[result]], #1\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "v15", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 8,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 1, 8, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
"mov v6.16b, v3.16b\n"
|
|
"mov v7.16b, v4.16b\n"
|
|
|
|
// 1x8 lanes loop.
|
|
"1:"
|
|
|
|
"ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
|
|
"ld1 {v8.2s}, [%x[lhs]], #8\n"
|
|
"umull v13.8h, v8.8b, v9.8b\n"
|
|
"umull v14.8h, v8.8b, v10.8b\n"
|
|
"umull v15.8h, v8.8b, v11.8b\n"
|
|
"umull v16.8h, v8.8b, v12.8b\n"
|
|
"ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
|
|
"uadalp v0.4s, v13.8h\n"
|
|
"uadalp v1.4s, v14.8h\n"
|
|
"uadalp v2.4s, v15.8h\n"
|
|
"uadalp v3.4s, v16.8h\n"
|
|
"prfm pldl1keep, [%x[rhs], #256]\n"
|
|
"umull v17.8h, v8.8b, v9.8b\n"
|
|
"umull v13.8h, v8.8b, v10.8b\n"
|
|
"umull v14.8h, v8.8b, v11.8b\n"
|
|
"umull v15.8h, v8.8b, v12.8b\n"
|
|
"prfm pldl1keep, [%x[lhs], #32]\n"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"uadalp v4.4s, v17.8h\n"
|
|
"uadalp v5.4s, v13.8h\n"
|
|
"uadalp v6.4s, v14.8h\n"
|
|
"uadalp v7.4s, v15.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v8.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n"
|
|
"dup v11.4s, %w[multiplicative_offset]\n"
|
|
"dup v12.4s, %w[rounding_offset]\n"
|
|
"dup v13.4s, %w[shift]\n"
|
|
"dup v8.4s, v8.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v6.4s, v6.4s, v7.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v1.4s, v4.4s, v6.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v8.4s\n"
|
|
"add v1.4s, v1.4s, v8.4s\n"
|
|
"add v0.4s, v0.4s, v9.4s\n"
|
|
"add v1.4s, v1.4s, v10.4s\n"
|
|
"mul v0.4s, v0.4s, v11.4s\n"
|
|
"mul v1.4s, v1.4s, v11.4s\n"
|
|
"add v0.4s, v0.4s, v12.4s\n"
|
|
"add v1.4s, v1.4s, v12.4s\n"
|
|
"sshl v0.4s, v0.4s, v13.4s\n"
|
|
"sshl v1.4s, v1.4s, v13.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 1,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 2, 1, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n"
|
|
"ld1 {v4.2s}, [%x[rhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v5.8h, v4.8b, v2.8b\n"
|
|
"umull v6.8h, v4.8b, v3.8b\n"
|
|
"uadalp v0.4s, v5.8h\n"
|
|
"uadalp v1.4s, v6.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[multiplicative_offset]\n"
|
|
"dup v7.4s, %w[rounding_offset]\n"
|
|
"dup v8.4s, %w[shift]\n"
|
|
"dup v2.4s, v4.s[0]\n"
|
|
"dup v4.4s, v4.s[1]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v1.4s, v1.4s, v1.4s\n"
|
|
"addp v1.4s, v1.4s, v1.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v2.4s\n"
|
|
"add v1.4s, v1.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"add v1.4s, v1.4s, v5.4s\n"
|
|
"mul v0.4s, v0.4s, v6.4s\n"
|
|
"mul v1.4s, v1.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v1.4s, v1.4s, v7.4s\n"
|
|
"sshl v0.4s, v0.4s, v8.4s\n"
|
|
"sshl v1.4s, v1.4s, v8.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn v1.4h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun v1.8b, v1.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.b}[0], [%x[result]], #1\n"
|
|
"st1 {v1.b}[0], [x0], #1\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc",
|
|
"memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 2,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 2, 2, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n"
|
|
"ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v8.8h, v6.8b, v4.8b\n"
|
|
"umull v9.8h, v7.8b, v4.8b\n"
|
|
"umull v10.8h, v6.8b, v5.8b\n"
|
|
"umull v11.8h, v7.8b, v5.8b\n"
|
|
"uadalp v0.4s, v8.8h\n"
|
|
"uadalp v1.4s, v9.8h\n"
|
|
"uadalp v2.4s, v10.8h\n"
|
|
"uadalp v3.4s, v11.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[multiplicative_offset]\n"
|
|
"dup v7.4s, %w[rounding_offset]\n"
|
|
"dup v8.4s, %w[shift]\n"
|
|
"dup v9.4s, v4.s[0]\n"
|
|
"dup v4.4s, v4.s[1]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v9.4s\n"
|
|
"add v2.4s, v2.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"add v2.4s, v2.4s, v5.4s\n"
|
|
"mul v0.4s, v0.4s, v6.4s\n"
|
|
"mul v2.4s, v2.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v2.4s, v2.4s, v7.4s\n"
|
|
"sshl v0.4s, v0.4s, v8.4s\n"
|
|
"sshl v2.4s, v2.4s, v8.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun v2.8b, v2.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.h}[0], [%x[result]], #2\n"
|
|
"st1 {v2.h}[0], [x0], #2\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 3,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 2, 3, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n"
|
|
"ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v11.8h, v8.8b, v6.8b\n"
|
|
"umull v12.8h, v9.8b, v6.8b\n"
|
|
"umull v13.8h, v10.8b, v6.8b\n"
|
|
"umull v14.8h, v8.8b, v7.8b\n"
|
|
"umull v15.8h, v9.8b, v7.8b\n"
|
|
"umull v16.8h, v10.8b, v7.8b\n"
|
|
"uadalp v0.4s, v11.8h\n"
|
|
"uadalp v1.4s, v12.8h\n"
|
|
"uadalp v2.4s, v13.8h\n"
|
|
"uadalp v3.4s, v14.8h\n"
|
|
"uadalp v4.4s, v15.8h\n"
|
|
"uadalp v5.4s, v16.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v6.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v7.4s}, [%x[rhs]], #16\n"
|
|
"dup v8.4s, %w[multiplicative_offset]\n"
|
|
"dup v9.4s, %w[rounding_offset]\n"
|
|
"dup v10.4s, %w[shift]\n"
|
|
"dup v11.4s, v6.s[0]\n"
|
|
"dup v6.4s, v6.s[1]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v3.4s, v3.4s, v4.4s\n"
|
|
"addp v5.4s, v5.4s, v5.4s\n"
|
|
"addp v3.4s, v3.4s, v5.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v11.4s\n"
|
|
"add v3.4s, v3.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v3.4s, v3.4s, v7.4s\n"
|
|
"mul v0.4s, v0.4s, v8.4s\n"
|
|
"mul v3.4s, v3.4s, v8.4s\n"
|
|
"add v0.4s, v0.4s, v9.4s\n"
|
|
"add v3.4s, v3.4s, v9.4s\n"
|
|
"sshl v0.4s, v0.4s, v10.4s\n"
|
|
"sshl v3.4s, v3.4s, v10.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn v3.4h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun v3.8b, v3.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.h}[0], [%x[result]], #2\n"
|
|
"st1 {v0.b}[2], [%x[result]], #1\n"
|
|
"st1 {v3.h}[0], [x0], #2\n"
|
|
"st1 {v3.b}[2], [x0], #1\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 4,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 2, 4, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
"mov v6.16b, v3.16b\n"
|
|
"mov v7.16b, v4.16b\n"
|
|
|
|
// 2x4 lanes loop.
|
|
"1:"
|
|
|
|
"ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n"
|
|
"ld1 {v8.8b}, [%x[lhs]], #8\n"
|
|
"umull v14.8h, v8.8b, v10.8b\n"
|
|
"ld1 {v9.8b}, [%x[lhs]], #8\n"
|
|
"umull v15.8h, v8.8b, v11.8b\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v16.8h, v8.8b, v12.8b\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v17.8h, v8.8b, v13.8b\n"
|
|
"umull v18.8h, v9.8b, v10.8b\n"
|
|
"uadalp v0.4s, v14.8h\n"
|
|
"uadalp v1.4s, v15.8h\n"
|
|
"uadalp v2.4s, v16.8h\n"
|
|
"umull v14.8h, v9.8b, v11.8b\n"
|
|
"umull v15.8h, v9.8b, v12.8b\n"
|
|
"umull v16.8h, v9.8b, v13.8b\n"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"uadalp v3.4s, v17.8h\n"
|
|
"uadalp v4.4s, v18.8h\n"
|
|
"uadalp v5.4s, v14.8h\n"
|
|
"uadalp v6.4s, v15.8h\n"
|
|
"uadalp v7.4s, v16.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v8.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v9.4s}, [%x[rhs]], #16\n"
|
|
"dup v10.4s, %w[multiplicative_offset]\n"
|
|
"dup v11.4s, %w[rounding_offset]\n"
|
|
"dup v12.4s, %w[shift]\n"
|
|
"dup v13.4s, v8.s[0]\n"
|
|
"dup v8.4s, v8.s[1]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v6.4s, v6.4s, v7.4s\n"
|
|
"addp v4.4s, v4.4s, v6.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v13.4s\n"
|
|
"add v4.4s, v4.4s, v8.4s\n"
|
|
"add v0.4s, v0.4s, v9.4s\n"
|
|
"add v4.4s, v4.4s, v9.4s\n"
|
|
"mul v0.4s, v0.4s, v10.4s\n"
|
|
"mul v4.4s, v4.4s, v10.4s\n"
|
|
"add v0.4s, v0.4s, v11.4s\n"
|
|
"add v4.4s, v4.4s, v11.4s\n"
|
|
"sshl v0.4s, v0.4s, v12.4s\n"
|
|
"sshl v4.4s, v4.4s, v12.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn v4.4h, v4.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun v4.8b, v4.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.s}[0], [%x[result]], #4\n"
|
|
"st1 {v4.s}[0], [x0], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 1,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 3, 1, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n"
|
|
"ld1 {v6.2s}, [%x[rhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v7.8h, v6.8b, v3.8b\n"
|
|
"umull v8.8h, v6.8b, v4.8b\n"
|
|
"umull v9.8h, v6.8b, v5.8b\n"
|
|
"uadalp v0.4s, v7.8h\n"
|
|
"uadalp v1.4s, v8.8h\n"
|
|
"uadalp v2.4s, v9.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[multiplicative_offset]\n"
|
|
"dup v7.4s, %w[rounding_offset]\n"
|
|
"dup v8.4s, %w[shift]\n"
|
|
"dup v3.4s, v4.s[0]\n"
|
|
"dup v9.4s, v4.s[1]\n"
|
|
"dup v4.4s, v4.s[2]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
"add x1, x0, %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v1.4s, v1.4s, v1.4s\n"
|
|
"addp v1.4s, v1.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v3.4s\n"
|
|
"add v1.4s, v1.4s, v9.4s\n"
|
|
"add v2.4s, v2.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"add v1.4s, v1.4s, v5.4s\n"
|
|
"add v2.4s, v2.4s, v5.4s\n"
|
|
"mul v0.4s, v0.4s, v6.4s\n"
|
|
"mul v1.4s, v1.4s, v6.4s\n"
|
|
"mul v2.4s, v2.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v1.4s, v1.4s, v7.4s\n"
|
|
"add v2.4s, v2.4s, v7.4s\n"
|
|
"sshl v0.4s, v0.4s, v8.4s\n"
|
|
"sshl v1.4s, v1.4s, v8.4s\n"
|
|
"sshl v2.4s, v2.4s, v8.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn v1.4h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun v1.8b, v1.8h\n"
|
|
"sqxtun v2.8b, v2.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.b}[0], [%x[result]], #1\n"
|
|
"st1 {v1.b}[0], [x0], #1\n"
|
|
"st1 {v2.b}[0], [x1], #1\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 2,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 3, 2, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n"
|
|
"ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v11.8h, v9.8b, v6.8b\n"
|
|
"umull v12.8h, v10.8b, v6.8b\n"
|
|
"umull v13.8h, v9.8b, v7.8b\n"
|
|
"umull v14.8h, v10.8b, v7.8b\n"
|
|
"umull v15.8h, v9.8b, v8.8b\n"
|
|
"umull v16.8h, v10.8b, v8.8b\n"
|
|
"uadalp v0.4s, v11.8h\n"
|
|
"uadalp v1.4s, v12.8h\n"
|
|
"uadalp v2.4s, v13.8h\n"
|
|
"uadalp v3.4s, v14.8h\n"
|
|
"uadalp v4.4s, v15.8h\n"
|
|
"uadalp v5.4s, v16.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v6.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v7.4s}, [%x[rhs]], #16\n"
|
|
"dup v8.4s, %w[multiplicative_offset]\n"
|
|
"dup v9.4s, %w[rounding_offset]\n"
|
|
"dup v10.4s, %w[shift]\n"
|
|
"dup v11.4s, v6.s[0]\n"
|
|
"dup v12.4s, v6.s[1]\n"
|
|
"dup v6.4s, v6.s[2]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
"add x1, x0, %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v4.4s, v4.4s, v4.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v11.4s\n"
|
|
"add v2.4s, v2.4s, v12.4s\n"
|
|
"add v4.4s, v4.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v2.4s, v2.4s, v7.4s\n"
|
|
"add v4.4s, v4.4s, v7.4s\n"
|
|
"mul v0.4s, v0.4s, v8.4s\n"
|
|
"mul v2.4s, v2.4s, v8.4s\n"
|
|
"mul v4.4s, v4.4s, v8.4s\n"
|
|
"add v0.4s, v0.4s, v9.4s\n"
|
|
"add v2.4s, v2.4s, v9.4s\n"
|
|
"add v4.4s, v4.4s, v9.4s\n"
|
|
"sshl v0.4s, v0.4s, v10.4s\n"
|
|
"sshl v2.4s, v2.4s, v10.4s\n"
|
|
"sshl v4.4s, v4.4s, v10.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn v4.4h, v4.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun v2.8b, v2.8h\n"
|
|
"sqxtun v4.8b, v4.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.h}[0], [%x[result]], #2\n"
|
|
"st1 {v2.h}[0], [x0], #2\n"
|
|
"st1 {v4.h}[0], [x1], #2\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void
|
|
MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 3,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessed,
|
|
RowMajor>& params,
|
|
uint8_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
|
|
"QuantizedStaticPreprocessed, RowMajor, 3, 3, 8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
"mov v6.16b, v3.16b\n"
|
|
"mov v7.16b, v4.16b\n"
|
|
"mov v8.16b, v5.16b\n"
|
|
|
|
// 3x3 lanes loop.
|
|
"1:"
|
|
|
|
"ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n"
|
|
"ld1 {v9.8b}, [%x[lhs]], #8\n"
|
|
"umull v15.8h, v9.8b, v12.8b\n"
|
|
"ld1 {v10.8b}, [%x[lhs]], #8\n"
|
|
"umull v16.8h, v9.8b, v13.8b\n"
|
|
"ld1 {v11.8b}, [%x[lhs]], #8\n"
|
|
"umull v17.8h, v9.8b, v14.8b\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v18.8h, v10.8b, v12.8b\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"uadalp v0.4s, v15.8h\n"
|
|
"uadalp v1.4s, v16.8h\n"
|
|
"uadalp v2.4s, v17.8h\n"
|
|
"uadalp v3.4s, v18.8h\n"
|
|
"umull v15.8h, v10.8b, v13.8b\n"
|
|
"umull v16.8h, v10.8b, v14.8b\n"
|
|
"umull v17.8h, v11.8b, v12.8b\n"
|
|
"umull v18.8h, v11.8b, v13.8b\n"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"umull v9.8h, v11.8b, v14.8b\n"
|
|
"uadalp v4.4s, v15.8h\n"
|
|
"uadalp v5.4s, v16.8h\n"
|
|
"uadalp v6.4s, v17.8h\n"
|
|
"uadalp v7.4s, v18.8h\n"
|
|
"uadalp v8.4s, v9.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantization::Prepare
|
|
"ld1 {v9.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v10.4s}, [%x[rhs]], #16\n"
|
|
"dup v11.4s, %w[multiplicative_offset]\n"
|
|
"dup v12.4s, %w[rounding_offset]\n"
|
|
"dup v13.4s, %w[shift]\n"
|
|
"dup v14.4s, v9.s[0]\n"
|
|
"dup v15.4s, v9.s[1]\n"
|
|
"dup v9.4s, v9.s[2]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
"add x1, x0, %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v3.4s, v3.4s, v4.4s\n"
|
|
"addp v5.4s, v5.4s, v5.4s\n"
|
|
"addp v3.4s, v3.4s, v5.4s\n"
|
|
"addp v6.4s, v6.4s, v7.4s\n"
|
|
"addp v8.4s, v8.4s, v8.4s\n"
|
|
"addp v6.4s, v6.4s, v8.4s\n"
|
|
|
|
// StaticQuantization::Transform
|
|
"add v0.4s, v0.4s, v14.4s\n"
|
|
"add v3.4s, v3.4s, v15.4s\n"
|
|
"add v6.4s, v6.4s, v9.4s\n"
|
|
"add v0.4s, v0.4s, v10.4s\n"
|
|
"add v3.4s, v3.4s, v10.4s\n"
|
|
"add v6.4s, v6.4s, v10.4s\n"
|
|
"mul v0.4s, v0.4s, v11.4s\n"
|
|
"mul v3.4s, v3.4s, v11.4s\n"
|
|
"mul v6.4s, v6.4s, v11.4s\n"
|
|
"add v0.4s, v0.4s, v12.4s\n"
|
|
"add v3.4s, v3.4s, v12.4s\n"
|
|
"add v6.4s, v6.4s, v12.4s\n"
|
|
"sshl v0.4s, v0.4s, v13.4s\n"
|
|
"sshl v3.4s, v3.4s, v13.4s\n"
|
|
"sshl v6.4s, v6.4s, v13.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn v3.4h, v3.4s\n"
|
|
"sqxtn v6.4h, v6.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun v3.8b, v3.8h\n"
|
|
"sqxtun v6.8b, v6.8h\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.h}[0], [%x[result]], #2\n"
|
|
"st1 {v0.b}[2], [%x[result]], #1\n"
|
|
"st1 {v3.h}[0], [x0], #2\n"
|
|
"st1 {v3.b}[2], [x0], #1\n"
|
|
"st1 {v6.h}[0], [x1], #2\n"
|
|
"st1 {v6.b}[2], [x1], #1\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[multiplicative_offset] "r"(params.kernel.multiplicative_offset),
|
|
[shift] "r"(params.kernel.shift),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[rounding_offset] "r"(params.kernel.rounding_offset)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc",
|
|
"memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v1.2s}, [%x[lhs]], #8\n"
|
|
"ld1 {v2.2s}, [%x[rhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v3.8h, v2.8b, v1.8b\n"
|
|
"uadalp v0.4s, v3.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v4.4s, v4.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.s}[0], [%x[result]], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v2.2s}, [%x[lhs]], #8\n"
|
|
"ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v5.8h, v3.8b, v2.8b\n"
|
|
"umull v6.8h, v4.8b, v2.8b\n"
|
|
"uadalp v0.4s, v5.8h\n"
|
|
"uadalp v1.4s, v6.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v4.4s, v4.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v3.2s}, [%x[lhs]], #8\n"
|
|
"ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v7.8h, v4.8b, v3.8b\n"
|
|
"umull v8.8h, v5.8b, v3.8b\n"
|
|
"umull v9.8h, v6.8b, v3.8b\n"
|
|
"uadalp v0.4s, v7.8h\n"
|
|
"uadalp v1.4s, v8.8h\n"
|
|
"uadalp v2.4s, v9.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v4.4s, v4.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
"st1 {v0.s}[2], [%x[result]], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc",
|
|
"memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v4.2s}, [%x[lhs]], #8\n"
|
|
"ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v9.8h, v5.8b, v4.8b\n"
|
|
"umull v10.8h, v6.8b, v4.8b\n"
|
|
"umull v11.8h, v7.8b, v4.8b\n"
|
|
"umull v12.8h, v8.8b, v4.8b\n"
|
|
"uadalp v0.4s, v9.8h\n"
|
|
"uadalp v1.4s, v10.8h\n"
|
|
"uadalp v2.4s, v11.8h\n"
|
|
"uadalp v3.4s, v12.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v4.4s, v4.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.4s}, [%x[result]], #16\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
|
|
// General 1xM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
|
|
"ld1 {v9.2s}, [%x[lhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v10.8h, v5.8b, v9.8b\n"
|
|
"umull v11.8h, v6.8b, v9.8b\n"
|
|
"umull v12.8h, v7.8b, v9.8b\n"
|
|
"umull v13.8h, v8.8b, v9.8b\n"
|
|
"ld1 {v5.2s}, [%x[rhs]], #8\n"
|
|
"prfm pldl1keep, [%x[rhs], #128]\n"
|
|
"uadalp v0.4s, v10.8h\n"
|
|
"uadalp v1.4s, v11.8h\n"
|
|
"uadalp v2.4s, v12.8h\n"
|
|
"uadalp v3.4s, v13.8h\n"
|
|
"umull v10.8h, v5.8b, v9.8b\n"
|
|
"uadalp v4.4s, v10.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v5.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n"
|
|
"dup v5.4s, v5.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v4.4s, v4.4s, v4.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v1.4s, v4.4s, v4.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"add v1.4s, v1.4s, v5.4s\n"
|
|
"add v0.4s, v0.4s, v6.4s\n"
|
|
"add v1.4s, v1.4s, v7.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.4s}, [%x[result]], #16\n"
|
|
"st1 {v1.s}[0], [%x[result]], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
|
|
// General 1xM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n"
|
|
"ld1 {v10.2s}, [%x[lhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v11.8h, v6.8b, v10.8b\n"
|
|
"umull v12.8h, v7.8b, v10.8b\n"
|
|
"umull v13.8h, v8.8b, v10.8b\n"
|
|
"umull v14.8h, v9.8b, v10.8b\n"
|
|
"ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
|
|
"prfm pldl1keep, [%x[rhs], #128]\n"
|
|
"uadalp v0.4s, v11.8h\n"
|
|
"uadalp v1.4s, v12.8h\n"
|
|
"uadalp v2.4s, v13.8h\n"
|
|
"uadalp v3.4s, v14.8h\n"
|
|
"umull v11.8h, v6.8b, v10.8b\n"
|
|
"umull v12.8h, v7.8b, v10.8b\n"
|
|
"uadalp v4.4s, v11.8h\n"
|
|
"uadalp v5.4s, v12.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v6.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n"
|
|
"dup v6.4s, v6.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v1.4s, v4.4s, v4.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v6.4s\n"
|
|
"add v1.4s, v1.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v1.4s, v1.4s, v8.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.4s}, [%x[result]], #16\n"
|
|
"st1 {v1.2s}, [%x[result]], #8\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
"mov v6.16b, v3.16b\n"
|
|
|
|
// General 1xM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n"
|
|
"ld1 {v11.2s}, [%x[lhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v12.8h, v7.8b, v11.8b\n"
|
|
"umull v13.8h, v8.8b, v11.8b\n"
|
|
"umull v14.8h, v9.8b, v11.8b\n"
|
|
"umull v15.8h, v10.8b, v11.8b\n"
|
|
"ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n"
|
|
"prfm pldl1keep, [%x[rhs], #128]\n"
|
|
"uadalp v0.4s, v12.8h\n"
|
|
"uadalp v1.4s, v13.8h\n"
|
|
"uadalp v2.4s, v14.8h\n"
|
|
"uadalp v3.4s, v15.8h\n"
|
|
"umull v12.8h, v7.8b, v11.8b\n"
|
|
"umull v13.8h, v8.8b, v11.8b\n"
|
|
"umull v14.8h, v9.8b, v11.8b\n"
|
|
"uadalp v4.4s, v12.8h\n"
|
|
"uadalp v5.4s, v13.8h\n"
|
|
"uadalp v6.4s, v14.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v7.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n"
|
|
"dup v7.4s, v7.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v6.4s, v6.4s, v6.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v1.4s, v4.4s, v6.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v1.4s, v1.4s, v7.4s\n"
|
|
"add v0.4s, v0.4s, v8.4s\n"
|
|
"add v1.4s, v1.4s, v9.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.4s}, [%x[result]], #16\n"
|
|
"st1 {v1.2s}, [%x[result]], #8\n"
|
|
"st1 {v1.s}[2], [%x[result]], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "v15", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
"mov v6.16b, v3.16b\n"
|
|
"mov v7.16b, v4.16b\n"
|
|
|
|
// 1x8 lanes loop.
|
|
"1:"
|
|
|
|
"ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
|
|
"ld1 {v8.2s}, [%x[lhs]], #8\n"
|
|
"umull v13.8h, v8.8b, v9.8b\n"
|
|
"umull v14.8h, v8.8b, v10.8b\n"
|
|
"umull v15.8h, v8.8b, v11.8b\n"
|
|
"umull v16.8h, v8.8b, v12.8b\n"
|
|
"ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
|
|
"uadalp v0.4s, v13.8h\n"
|
|
"uadalp v1.4s, v14.8h\n"
|
|
"uadalp v2.4s, v15.8h\n"
|
|
"uadalp v3.4s, v16.8h\n"
|
|
"prfm pldl1keep, [%x[rhs], #256]\n"
|
|
"umull v17.8h, v8.8b, v9.8b\n"
|
|
"umull v13.8h, v8.8b, v10.8b\n"
|
|
"umull v14.8h, v8.8b, v11.8b\n"
|
|
"umull v15.8h, v8.8b, v12.8b\n"
|
|
"prfm pldl1keep, [%x[lhs], #32]\n"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"uadalp v4.4s, v17.8h\n"
|
|
"uadalp v5.4s, v13.8h\n"
|
|
"uadalp v6.4s, v14.8h\n"
|
|
"uadalp v7.4s, v15.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v8.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n"
|
|
"dup v8.4s, v8.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v6.4s, v6.4s, v7.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v1.4s, v4.4s, v6.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v8.4s\n"
|
|
"add v1.4s, v1.4s, v8.4s\n"
|
|
"add v0.4s, v0.4s, v9.4s\n"
|
|
"add v1.4s, v1.4s, v10.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.4s, v1.4s}, [%x[result]], #32\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n"
|
|
"ld1 {v4.2s}, [%x[rhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v5.8h, v4.8b, v2.8b\n"
|
|
"umull v6.8h, v4.8b, v3.8b\n"
|
|
"uadalp v0.4s, v5.8h\n"
|
|
"uadalp v1.4s, v6.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v2.4s, v4.s[0]\n"
|
|
"dup v4.4s, v4.s[1]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v1.4s, v1.4s, v1.4s\n"
|
|
"addp v1.4s, v1.4s, v1.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v2.4s\n"
|
|
"add v1.4s, v1.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"add v1.4s, v1.4s, v5.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.s}[0], [%x[result]], #4\n"
|
|
"st1 {v1.s}[0], [x0], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n"
|
|
"ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v8.8h, v6.8b, v4.8b\n"
|
|
"umull v9.8h, v7.8b, v4.8b\n"
|
|
"umull v10.8h, v6.8b, v5.8b\n"
|
|
"umull v11.8h, v7.8b, v5.8b\n"
|
|
"uadalp v0.4s, v8.8h\n"
|
|
"uadalp v1.4s, v9.8h\n"
|
|
"uadalp v2.4s, v10.8h\n"
|
|
"uadalp v3.4s, v11.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, v4.s[0]\n"
|
|
"dup v4.4s, v4.s[1]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v6.4s\n"
|
|
"add v2.4s, v2.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"add v2.4s, v2.4s, v5.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
"st1 {v2.2s}, [x0], #8\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n"
|
|
"ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v11.8h, v8.8b, v6.8b\n"
|
|
"umull v12.8h, v9.8b, v6.8b\n"
|
|
"umull v13.8h, v10.8b, v6.8b\n"
|
|
"umull v14.8h, v8.8b, v7.8b\n"
|
|
"umull v15.8h, v9.8b, v7.8b\n"
|
|
"umull v16.8h, v10.8b, v7.8b\n"
|
|
"uadalp v0.4s, v11.8h\n"
|
|
"uadalp v1.4s, v12.8h\n"
|
|
"uadalp v2.4s, v13.8h\n"
|
|
"uadalp v3.4s, v14.8h\n"
|
|
"uadalp v4.4s, v15.8h\n"
|
|
"uadalp v5.4s, v16.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v6.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v7.4s}, [%x[rhs]], #16\n"
|
|
"dup v8.4s, v6.s[0]\n"
|
|
"dup v6.4s, v6.s[1]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v3.4s, v3.4s, v4.4s\n"
|
|
"addp v5.4s, v5.4s, v5.4s\n"
|
|
"addp v3.4s, v3.4s, v5.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v8.4s\n"
|
|
"add v3.4s, v3.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v3.4s, v3.4s, v7.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
"st1 {v0.s}[2], [%x[result]], #4\n"
|
|
"st1 {v3.2s}, [x0], #8\n"
|
|
"st1 {v3.s}[2], [x0], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
"mov v6.16b, v3.16b\n"
|
|
"mov v7.16b, v4.16b\n"
|
|
|
|
// 2x4 lanes loop.
|
|
"1:"
|
|
|
|
"ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n"
|
|
"ld1 {v8.8b}, [%x[lhs]], #8\n"
|
|
"umull v14.8h, v8.8b, v10.8b\n"
|
|
"ld1 {v9.8b}, [%x[lhs]], #8\n"
|
|
"umull v15.8h, v8.8b, v11.8b\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v16.8h, v8.8b, v12.8b\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v17.8h, v8.8b, v13.8b\n"
|
|
"umull v18.8h, v9.8b, v10.8b\n"
|
|
"uadalp v0.4s, v14.8h\n"
|
|
"uadalp v1.4s, v15.8h\n"
|
|
"uadalp v2.4s, v16.8h\n"
|
|
"umull v14.8h, v9.8b, v11.8b\n"
|
|
"umull v15.8h, v9.8b, v12.8b\n"
|
|
"umull v16.8h, v9.8b, v13.8b\n"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"uadalp v3.4s, v17.8h\n"
|
|
"uadalp v4.4s, v18.8h\n"
|
|
"uadalp v5.4s, v14.8h\n"
|
|
"uadalp v6.4s, v15.8h\n"
|
|
"uadalp v7.4s, v16.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v8.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v9.4s}, [%x[rhs]], #16\n"
|
|
"dup v10.4s, v8.s[0]\n"
|
|
"dup v8.4s, v8.s[1]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v6.4s, v6.4s, v7.4s\n"
|
|
"addp v4.4s, v4.4s, v6.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v10.4s\n"
|
|
"add v4.4s, v4.4s, v8.4s\n"
|
|
"add v0.4s, v0.4s, v9.4s\n"
|
|
"add v4.4s, v4.4s, v9.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.4s}, [%x[result]], #16\n"
|
|
"st1 {v4.4s}, [x0], #16\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n"
|
|
"ld1 {v6.2s}, [%x[rhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v7.8h, v6.8b, v3.8b\n"
|
|
"umull v8.8h, v6.8b, v4.8b\n"
|
|
"umull v9.8h, v6.8b, v5.8b\n"
|
|
"uadalp v0.4s, v7.8h\n"
|
|
"uadalp v1.4s, v8.8h\n"
|
|
"uadalp v2.4s, v9.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v3.4s, v4.s[0]\n"
|
|
"dup v6.4s, v4.s[1]\n"
|
|
"dup v4.4s, v4.s[2]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
"add x1, x0, %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v1.4s, v1.4s, v1.4s\n"
|
|
"addp v1.4s, v1.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v3.4s\n"
|
|
"add v1.4s, v1.4s, v6.4s\n"
|
|
"add v2.4s, v2.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"add v1.4s, v1.4s, v5.4s\n"
|
|
"add v2.4s, v2.4s, v5.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.s}[0], [%x[result]], #4\n"
|
|
"st1 {v1.s}[0], [x0], #4\n"
|
|
"st1 {v2.s}[0], [x1], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n"
|
|
"ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v11.8h, v9.8b, v6.8b\n"
|
|
"umull v12.8h, v10.8b, v6.8b\n"
|
|
"umull v13.8h, v9.8b, v7.8b\n"
|
|
"umull v14.8h, v10.8b, v7.8b\n"
|
|
"umull v15.8h, v9.8b, v8.8b\n"
|
|
"umull v16.8h, v10.8b, v8.8b\n"
|
|
"uadalp v0.4s, v11.8h\n"
|
|
"uadalp v1.4s, v12.8h\n"
|
|
"uadalp v2.4s, v13.8h\n"
|
|
"uadalp v3.4s, v14.8h\n"
|
|
"uadalp v4.4s, v15.8h\n"
|
|
"uadalp v5.4s, v16.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v6.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v7.4s}, [%x[rhs]], #16\n"
|
|
"dup v8.4s, v6.s[0]\n"
|
|
"dup v9.4s, v6.s[1]\n"
|
|
"dup v6.4s, v6.s[2]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
"add x1, x0, %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v4.4s, v4.4s, v4.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v8.4s\n"
|
|
"add v2.4s, v2.4s, v9.4s\n"
|
|
"add v4.4s, v4.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v2.4s, v2.4s, v7.4s\n"
|
|
"add v4.4s, v4.4s, v7.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
"st1 {v2.2s}, [x0], #8\n"
|
|
"st1 {v4.2s}, [x1], #8\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
|
|
RowMajor>& params,
|
|
int32_t* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
|
|
"QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
"mov v6.16b, v3.16b\n"
|
|
"mov v7.16b, v4.16b\n"
|
|
"mov v8.16b, v5.16b\n"
|
|
|
|
// 3x3 lanes loop.
|
|
"1:"
|
|
|
|
"ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n"
|
|
"ld1 {v9.8b}, [%x[lhs]], #8\n"
|
|
"umull v15.8h, v9.8b, v12.8b\n"
|
|
"ld1 {v10.8b}, [%x[lhs]], #8\n"
|
|
"umull v16.8h, v9.8b, v13.8b\n"
|
|
"ld1 {v11.8b}, [%x[lhs]], #8\n"
|
|
"umull v17.8h, v9.8b, v14.8b\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v18.8h, v10.8b, v12.8b\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"uadalp v0.4s, v15.8h\n"
|
|
"uadalp v1.4s, v16.8h\n"
|
|
"uadalp v2.4s, v17.8h\n"
|
|
"uadalp v3.4s, v18.8h\n"
|
|
"umull v15.8h, v10.8b, v13.8b\n"
|
|
"umull v16.8h, v10.8b, v14.8b\n"
|
|
"umull v17.8h, v11.8b, v12.8b\n"
|
|
"umull v18.8h, v11.8b, v13.8b\n"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"umull v9.8h, v11.8b, v14.8b\n"
|
|
"uadalp v4.4s, v15.8h\n"
|
|
"uadalp v5.4s, v16.8h\n"
|
|
"uadalp v6.4s, v17.8h\n"
|
|
"uadalp v7.4s, v18.8h\n"
|
|
"uadalp v8.4s, v9.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationInt32::Prepare
|
|
"ld1 {v9.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v10.4s}, [%x[rhs]], #16\n"
|
|
"dup v11.4s, v9.s[0]\n"
|
|
"dup v12.4s, v9.s[1]\n"
|
|
"dup v9.4s, v9.s[2]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
"add x1, x0, %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v3.4s, v3.4s, v4.4s\n"
|
|
"addp v5.4s, v5.4s, v5.4s\n"
|
|
"addp v3.4s, v3.4s, v5.4s\n"
|
|
"addp v6.4s, v6.4s, v7.4s\n"
|
|
"addp v8.4s, v8.4s, v8.4s\n"
|
|
"addp v6.4s, v6.4s, v8.4s\n"
|
|
|
|
// StaticQuantizationInt32::Transform
|
|
"add v0.4s, v0.4s, v11.4s\n"
|
|
"add v3.4s, v3.4s, v12.4s\n"
|
|
"add v6.4s, v6.4s, v9.4s\n"
|
|
"add v0.4s, v0.4s, v10.4s\n"
|
|
"add v3.4s, v3.4s, v10.4s\n"
|
|
"add v6.4s, v6.4s, v10.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
"st1 {v0.s}[2], [%x[result]], #4\n"
|
|
"st1 {v3.2s}, [x0], #8\n"
|
|
"st1 {v3.s}[2], [x0], #4\n"
|
|
"st1 {v6.2s}, [x1], #8\n"
|
|
"st1 {v6.s}[2], [x1], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc",
|
|
"memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v1.2s}, [%x[lhs]], #8\n"
|
|
"ld1 {v2.2s}, [%x[rhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v3.8h, v2.8b, v1.8b\n"
|
|
"uadalp v0.4s, v3.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[scale]\n"
|
|
"dup v4.4s, v4.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.s}[0], [%x[result]], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v2.2s}, [%x[lhs]], #8\n"
|
|
"ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v5.8h, v3.8b, v2.8b\n"
|
|
"umull v6.8h, v4.8b, v2.8b\n"
|
|
"uadalp v0.4s, v5.8h\n"
|
|
"uadalp v1.4s, v6.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[scale]\n"
|
|
"dup v4.4s, v4.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v3.2s}, [%x[lhs]], #8\n"
|
|
"ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v7.8h, v4.8b, v3.8b\n"
|
|
"umull v8.8h, v5.8b, v3.8b\n"
|
|
"umull v9.8h, v6.8b, v3.8b\n"
|
|
"uadalp v0.4s, v7.8h\n"
|
|
"uadalp v1.4s, v8.8h\n"
|
|
"uadalp v2.4s, v9.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[scale]\n"
|
|
"dup v4.4s, v4.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
"st1 {v0.s}[2], [%x[result]], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc",
|
|
"memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v4.2s}, [%x[lhs]], #8\n"
|
|
"ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v9.8h, v5.8b, v4.8b\n"
|
|
"umull v10.8h, v6.8b, v4.8b\n"
|
|
"umull v11.8h, v7.8b, v4.8b\n"
|
|
"umull v12.8h, v8.8b, v4.8b\n"
|
|
"uadalp v0.4s, v9.8h\n"
|
|
"uadalp v1.4s, v10.8h\n"
|
|
"uadalp v2.4s, v11.8h\n"
|
|
"uadalp v3.4s, v12.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[scale]\n"
|
|
"dup v4.4s, v4.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.4s}, [%x[result]], #16\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
|
|
// General 1xM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
|
|
"ld1 {v9.2s}, [%x[lhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v10.8h, v5.8b, v9.8b\n"
|
|
"umull v11.8h, v6.8b, v9.8b\n"
|
|
"umull v12.8h, v7.8b, v9.8b\n"
|
|
"umull v13.8h, v8.8b, v9.8b\n"
|
|
"ld1 {v5.2s}, [%x[rhs]], #8\n"
|
|
"prfm pldl1keep, [%x[rhs], #128]\n"
|
|
"uadalp v0.4s, v10.8h\n"
|
|
"uadalp v1.4s, v11.8h\n"
|
|
"uadalp v2.4s, v12.8h\n"
|
|
"uadalp v3.4s, v13.8h\n"
|
|
"umull v10.8h, v5.8b, v9.8b\n"
|
|
"uadalp v4.4s, v10.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v5.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n"
|
|
"dup v8.4s, %w[scale]\n"
|
|
"dup v5.4s, v5.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v4.4s, v4.4s, v4.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v1.4s, v4.4s, v4.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"add v1.4s, v1.4s, v5.4s\n"
|
|
"add v0.4s, v0.4s, v6.4s\n"
|
|
"add v1.4s, v1.4s, v7.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.4s}, [%x[result]], #16\n"
|
|
"st1 {v1.s}[0], [%x[result]], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
|
|
// General 1xM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n"
|
|
"ld1 {v10.2s}, [%x[lhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v11.8h, v6.8b, v10.8b\n"
|
|
"umull v12.8h, v7.8b, v10.8b\n"
|
|
"umull v13.8h, v8.8b, v10.8b\n"
|
|
"umull v14.8h, v9.8b, v10.8b\n"
|
|
"ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
|
|
"prfm pldl1keep, [%x[rhs], #128]\n"
|
|
"uadalp v0.4s, v11.8h\n"
|
|
"uadalp v1.4s, v12.8h\n"
|
|
"uadalp v2.4s, v13.8h\n"
|
|
"uadalp v3.4s, v14.8h\n"
|
|
"umull v11.8h, v6.8b, v10.8b\n"
|
|
"umull v12.8h, v7.8b, v10.8b\n"
|
|
"uadalp v4.4s, v11.8h\n"
|
|
"uadalp v5.4s, v12.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v6.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n"
|
|
"dup v9.4s, %w[scale]\n"
|
|
"dup v6.4s, v6.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v1.4s, v4.4s, v4.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v6.4s\n"
|
|
"add v1.4s, v1.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v1.4s, v1.4s, v8.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.4s}, [%x[result]], #16\n"
|
|
"st1 {v1.2s}, [%x[result]], #8\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
"mov v6.16b, v3.16b\n"
|
|
|
|
// General 1xM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n"
|
|
"ld1 {v11.2s}, [%x[lhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v12.8h, v7.8b, v11.8b\n"
|
|
"umull v13.8h, v8.8b, v11.8b\n"
|
|
"umull v14.8h, v9.8b, v11.8b\n"
|
|
"umull v15.8h, v10.8b, v11.8b\n"
|
|
"ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n"
|
|
"prfm pldl1keep, [%x[rhs], #128]\n"
|
|
"uadalp v0.4s, v12.8h\n"
|
|
"uadalp v1.4s, v13.8h\n"
|
|
"uadalp v2.4s, v14.8h\n"
|
|
"uadalp v3.4s, v15.8h\n"
|
|
"umull v12.8h, v7.8b, v11.8b\n"
|
|
"umull v13.8h, v8.8b, v11.8b\n"
|
|
"umull v14.8h, v9.8b, v11.8b\n"
|
|
"uadalp v4.4s, v12.8h\n"
|
|
"uadalp v5.4s, v13.8h\n"
|
|
"uadalp v6.4s, v14.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v7.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n"
|
|
"dup v10.4s, %w[scale]\n"
|
|
"dup v7.4s, v7.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v6.4s, v6.4s, v6.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v1.4s, v4.4s, v6.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v1.4s, v1.4s, v7.4s\n"
|
|
"add v0.4s, v0.4s, v8.4s\n"
|
|
"add v1.4s, v1.4s, v9.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fmul v0.4s, v0.4s, v10.4s\n"
|
|
"fmul v1.4s, v1.4s, v10.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.4s}, [%x[result]], #16\n"
|
|
"st1 {v1.2s}, [%x[result]], #8\n"
|
|
"st1 {v1.s}[2], [%x[result]], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "v15", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
"mov v6.16b, v3.16b\n"
|
|
"mov v7.16b, v4.16b\n"
|
|
|
|
// 1x8 lanes loop.
|
|
"1:"
|
|
|
|
"ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
|
|
"ld1 {v8.2s}, [%x[lhs]], #8\n"
|
|
"umull v13.8h, v8.8b, v9.8b\n"
|
|
"umull v14.8h, v8.8b, v10.8b\n"
|
|
"umull v15.8h, v8.8b, v11.8b\n"
|
|
"umull v16.8h, v8.8b, v12.8b\n"
|
|
"ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
|
|
"uadalp v0.4s, v13.8h\n"
|
|
"uadalp v1.4s, v14.8h\n"
|
|
"uadalp v2.4s, v15.8h\n"
|
|
"uadalp v3.4s, v16.8h\n"
|
|
"prfm pldl1keep, [%x[rhs], #256]\n"
|
|
"umull v17.8h, v8.8b, v9.8b\n"
|
|
"umull v13.8h, v8.8b, v10.8b\n"
|
|
"umull v14.8h, v8.8b, v11.8b\n"
|
|
"umull v15.8h, v8.8b, v12.8b\n"
|
|
"prfm pldl1keep, [%x[lhs], #32]\n"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"uadalp v4.4s, v17.8h\n"
|
|
"uadalp v5.4s, v13.8h\n"
|
|
"uadalp v6.4s, v14.8h\n"
|
|
"uadalp v7.4s, v15.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v8.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n"
|
|
"dup v11.4s, %w[scale]\n"
|
|
"dup v8.4s, v8.s[0]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v6.4s, v6.4s, v7.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v1.4s, v4.4s, v6.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v8.4s\n"
|
|
"add v1.4s, v1.4s, v8.4s\n"
|
|
"add v0.4s, v0.4s, v9.4s\n"
|
|
"add v1.4s, v1.4s, v10.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fmul v0.4s, v0.4s, v11.4s\n"
|
|
"fmul v1.4s, v1.4s, v11.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.4s, v1.4s}, [%x[result]], #32\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n"
|
|
"ld1 {v4.2s}, [%x[rhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v5.8h, v4.8b, v2.8b\n"
|
|
"umull v6.8h, v4.8b, v3.8b\n"
|
|
"uadalp v0.4s, v5.8h\n"
|
|
"uadalp v1.4s, v6.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[scale]\n"
|
|
"dup v2.4s, v4.s[0]\n"
|
|
"dup v4.4s, v4.s[1]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v1.4s, v1.4s, v1.4s\n"
|
|
"addp v1.4s, v1.4s, v1.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v2.4s\n"
|
|
"add v1.4s, v1.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"add v1.4s, v1.4s, v5.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.s}[0], [%x[result]], #4\n"
|
|
"st1 {v1.s}[0], [x0], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n"
|
|
"ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v8.8h, v6.8b, v4.8b\n"
|
|
"umull v9.8h, v7.8b, v4.8b\n"
|
|
"umull v10.8h, v6.8b, v5.8b\n"
|
|
"umull v11.8h, v7.8b, v5.8b\n"
|
|
"uadalp v0.4s, v8.8h\n"
|
|
"uadalp v1.4s, v9.8h\n"
|
|
"uadalp v2.4s, v10.8h\n"
|
|
"uadalp v3.4s, v11.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[scale]\n"
|
|
"dup v7.4s, v4.s[0]\n"
|
|
"dup v4.4s, v4.s[1]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v2.4s, v2.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"add v2.4s, v2.4s, v5.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
"st1 {v2.2s}, [x0], #8\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n"
|
|
"ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v11.8h, v8.8b, v6.8b\n"
|
|
"umull v12.8h, v9.8b, v6.8b\n"
|
|
"umull v13.8h, v10.8b, v6.8b\n"
|
|
"umull v14.8h, v8.8b, v7.8b\n"
|
|
"umull v15.8h, v9.8b, v7.8b\n"
|
|
"umull v16.8h, v10.8b, v7.8b\n"
|
|
"uadalp v0.4s, v11.8h\n"
|
|
"uadalp v1.4s, v12.8h\n"
|
|
"uadalp v2.4s, v13.8h\n"
|
|
"uadalp v3.4s, v14.8h\n"
|
|
"uadalp v4.4s, v15.8h\n"
|
|
"uadalp v5.4s, v16.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v6.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v7.4s}, [%x[rhs]], #16\n"
|
|
"dup v8.4s, %w[scale]\n"
|
|
"dup v9.4s, v6.s[0]\n"
|
|
"dup v6.4s, v6.s[1]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v3.4s, v3.4s, v4.4s\n"
|
|
"addp v5.4s, v5.4s, v5.4s\n"
|
|
"addp v3.4s, v3.4s, v5.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v9.4s\n"
|
|
"add v3.4s, v3.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v3.4s, v3.4s, v7.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
"st1 {v0.s}[2], [%x[result]], #4\n"
|
|
"st1 {v3.2s}, [x0], #8\n"
|
|
"st1 {v3.s}[2], [x0], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
"mov v6.16b, v3.16b\n"
|
|
"mov v7.16b, v4.16b\n"
|
|
|
|
// 2x4 lanes loop.
|
|
"1:"
|
|
|
|
"ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n"
|
|
"ld1 {v8.8b}, [%x[lhs]], #8\n"
|
|
"umull v14.8h, v8.8b, v10.8b\n"
|
|
"ld1 {v9.8b}, [%x[lhs]], #8\n"
|
|
"umull v15.8h, v8.8b, v11.8b\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v16.8h, v8.8b, v12.8b\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v17.8h, v8.8b, v13.8b\n"
|
|
"umull v18.8h, v9.8b, v10.8b\n"
|
|
"uadalp v0.4s, v14.8h\n"
|
|
"uadalp v1.4s, v15.8h\n"
|
|
"uadalp v2.4s, v16.8h\n"
|
|
"umull v14.8h, v9.8b, v11.8b\n"
|
|
"umull v15.8h, v9.8b, v12.8b\n"
|
|
"umull v16.8h, v9.8b, v13.8b\n"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"uadalp v3.4s, v17.8h\n"
|
|
"uadalp v4.4s, v18.8h\n"
|
|
"uadalp v5.4s, v14.8h\n"
|
|
"uadalp v6.4s, v15.8h\n"
|
|
"uadalp v7.4s, v16.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v8.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v9.4s}, [%x[rhs]], #16\n"
|
|
"dup v10.4s, %w[scale]\n"
|
|
"dup v11.4s, v8.s[0]\n"
|
|
"dup v8.4s, v8.s[1]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v6.4s, v6.4s, v7.4s\n"
|
|
"addp v4.4s, v4.4s, v6.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v11.4s\n"
|
|
"add v4.4s, v4.4s, v8.4s\n"
|
|
"add v0.4s, v0.4s, v9.4s\n"
|
|
"add v4.4s, v4.4s, v9.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v10.4s\n"
|
|
"fmul v4.4s, v4.4s, v10.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.4s}, [%x[result]], #16\n"
|
|
"st1 {v4.4s}, [x0], #16\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
|
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n"
|
|
"ld1 {v6.2s}, [%x[rhs]], #8\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v7.8h, v6.8b, v3.8b\n"
|
|
"umull v8.8h, v6.8b, v4.8b\n"
|
|
"umull v9.8h, v6.8b, v5.8b\n"
|
|
"uadalp v0.4s, v7.8h\n"
|
|
"uadalp v1.4s, v8.8h\n"
|
|
"uadalp v2.4s, v9.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v4.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v5.4s}, [%x[rhs]], #16\n"
|
|
"dup v6.4s, %w[scale]\n"
|
|
"dup v3.4s, v4.s[0]\n"
|
|
"dup v7.4s, v4.s[1]\n"
|
|
"dup v4.4s, v4.s[2]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
"add x1, x0, %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v1.4s, v1.4s, v1.4s\n"
|
|
"addp v1.4s, v1.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v3.4s\n"
|
|
"add v1.4s, v1.4s, v7.4s\n"
|
|
"add v2.4s, v2.4s, v4.4s\n"
|
|
"add v0.4s, v0.4s, v5.4s\n"
|
|
"add v1.4s, v1.4s, v5.4s\n"
|
|
"add v2.4s, v2.4s, v5.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.s}[0], [%x[result]], #4\n"
|
|
"st1 {v1.s}[0], [x0], #4\n"
|
|
"st1 {v2.s}[0], [x1], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
|
|
// General NxM lanes loop.
|
|
"1:"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n"
|
|
"ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"umull v11.8h, v9.8b, v6.8b\n"
|
|
"umull v12.8h, v10.8b, v6.8b\n"
|
|
"umull v13.8h, v9.8b, v7.8b\n"
|
|
"umull v14.8h, v10.8b, v7.8b\n"
|
|
"umull v15.8h, v9.8b, v8.8b\n"
|
|
"umull v16.8h, v10.8b, v8.8b\n"
|
|
"uadalp v0.4s, v11.8h\n"
|
|
"uadalp v1.4s, v12.8h\n"
|
|
"uadalp v2.4s, v13.8h\n"
|
|
"uadalp v3.4s, v14.8h\n"
|
|
"uadalp v4.4s, v15.8h\n"
|
|
"uadalp v5.4s, v16.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v6.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v7.4s}, [%x[rhs]], #16\n"
|
|
"dup v8.4s, %w[scale]\n"
|
|
"dup v9.4s, v6.s[0]\n"
|
|
"dup v10.4s, v6.s[1]\n"
|
|
"dup v6.4s, v6.s[2]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
"add x1, x0, %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v0.4s, v0.4s, v0.4s\n"
|
|
"addp v2.4s, v2.4s, v3.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v4.4s, v4.4s, v5.4s\n"
|
|
"addp v4.4s, v4.4s, v4.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v9.4s\n"
|
|
"add v2.4s, v2.4s, v10.4s\n"
|
|
"add v4.4s, v4.4s, v6.4s\n"
|
|
"add v0.4s, v0.4s, v7.4s\n"
|
|
"add v2.4s, v2.4s, v7.4s\n"
|
|
"add v4.4s, v4.4s, v7.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v4.4s, v4.4s, v8.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
"st1 {v2.2s}, [x0], #8\n"
|
|
"st1 {v4.2s}, [x1], #8\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void MulKernel<
|
|
uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3,
|
|
8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
|
|
const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
|
|
RowMajor>& params,
|
|
float* result) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
|
|
"QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3, "
|
|
"8>::Multiply()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
asm volatile(
|
|
"prfm pldl1keep, [%x[lhs]]\n"
|
|
"prfm pldl1keep, [%x[rhs]]\n"
|
|
|
|
// Clear aggregators.
|
|
"movi v0.4s, #0\n"
|
|
"movi v1.4s, #0\n"
|
|
"movi v2.4s, #0\n"
|
|
"mov v3.16b, v0.16b\n"
|
|
"mov v4.16b, v1.16b\n"
|
|
"mov v5.16b, v2.16b\n"
|
|
"mov v6.16b, v3.16b\n"
|
|
"mov v7.16b, v4.16b\n"
|
|
"mov v8.16b, v5.16b\n"
|
|
|
|
// 3x3 lanes loop.
|
|
"1:"
|
|
|
|
"ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n"
|
|
"ld1 {v9.8b}, [%x[lhs]], #8\n"
|
|
"umull v15.8h, v9.8b, v12.8b\n"
|
|
"ld1 {v10.8b}, [%x[lhs]], #8\n"
|
|
"umull v16.8h, v9.8b, v13.8b\n"
|
|
"ld1 {v11.8b}, [%x[lhs]], #8\n"
|
|
"umull v17.8h, v9.8b, v14.8b\n"
|
|
"prfm pldl1keep, [%x[lhs], #64]\n"
|
|
"umull v18.8h, v10.8b, v12.8b\n"
|
|
"prfm pldl1keep, [%x[rhs], #64]\n"
|
|
"uadalp v0.4s, v15.8h\n"
|
|
"uadalp v1.4s, v16.8h\n"
|
|
"uadalp v2.4s, v17.8h\n"
|
|
"uadalp v3.4s, v18.8h\n"
|
|
"umull v15.8h, v10.8b, v13.8b\n"
|
|
"umull v16.8h, v10.8b, v14.8b\n"
|
|
"umull v17.8h, v11.8b, v12.8b\n"
|
|
"umull v18.8h, v11.8b, v13.8b\n"
|
|
|
|
// Subtract counter.
|
|
"subs %x[count], %x[count], #8\n"
|
|
|
|
"umull v9.8h, v11.8b, v14.8b\n"
|
|
"uadalp v4.4s, v15.8h\n"
|
|
"uadalp v5.4s, v16.8h\n"
|
|
"uadalp v6.4s, v17.8h\n"
|
|
"uadalp v7.4s, v18.8h\n"
|
|
"uadalp v8.4s, v9.8h\n"
|
|
|
|
// Loop break.
|
|
"bgt 1b\n"
|
|
|
|
// StaticQuantizationFloat::Prepare
|
|
"ld1 {v9.4s}, [%x[lhs]], #16\n"
|
|
"ld1 {v10.4s}, [%x[rhs]], #16\n"
|
|
"dup v11.4s, %w[scale]\n"
|
|
"dup v12.4s, v9.s[0]\n"
|
|
"dup v13.4s, v9.s[1]\n"
|
|
"dup v9.4s, v9.s[2]\n"
|
|
|
|
// RowMajorOutput::Prepare
|
|
"add x0, %x[result], %x[stride]\n"
|
|
"add x1, x0, %x[stride]\n"
|
|
|
|
// Reduce aggregators.
|
|
"addp v0.4s, v0.4s, v1.4s\n"
|
|
"addp v2.4s, v2.4s, v2.4s\n"
|
|
"addp v0.4s, v0.4s, v2.4s\n"
|
|
"addp v3.4s, v3.4s, v4.4s\n"
|
|
"addp v5.4s, v5.4s, v5.4s\n"
|
|
"addp v3.4s, v3.4s, v5.4s\n"
|
|
"addp v6.4s, v6.4s, v7.4s\n"
|
|
"addp v8.4s, v8.4s, v8.4s\n"
|
|
"addp v6.4s, v6.4s, v8.4s\n"
|
|
|
|
// StaticQuantizationFloat::Transform
|
|
"add v0.4s, v0.4s, v12.4s\n"
|
|
"add v3.4s, v3.4s, v13.4s\n"
|
|
"add v6.4s, v6.4s, v9.4s\n"
|
|
"add v0.4s, v0.4s, v10.4s\n"
|
|
"add v3.4s, v3.4s, v10.4s\n"
|
|
"add v6.4s, v6.4s, v10.4s\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v11.4s\n"
|
|
"fmul v3.4s, v3.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
|
|
// RowMajorOutput::Output
|
|
"st1 {v0.2s}, [%x[result]], #8\n"
|
|
"st1 {v0.s}[2], [%x[result]], #4\n"
|
|
"st1 {v3.2s}, [x0], #8\n"
|
|
"st1 {v3.s}[2], [x0], #4\n"
|
|
"st1 {v6.2s}, [x1], #8\n"
|
|
"st1 {v6.s}[2], [x1], #4\n"
|
|
: [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
|
|
: [count] "r"(params.kernel.count),
|
|
[stride] "r"(params.output_stream.stride),
|
|
[scale] "r"(params.kernel.scale)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc",
|
|
"memory");
|
|
}
|
|
|
|
} // namespace meta
|
|
} // namespace gemmlowp
|
|
|
|
#else
|
|
#warning "Meta gemm for arm64 requires: GEMMLOWP_NEON_64!"
|
|
#endif
|
|
|
|
#endif // GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_
|