12 changes: 8 additions & 4 deletions bench/subgraph/fully-connected.cc
@@ -202,16 +202,20 @@ BENCHMARK(QD8FullyConnected)
static void FullyConnectedArgs(benchmark::internal::Benchmark* b) {
b->ArgNames({"M", "K", "N"});

static const std::array<int64_t, 17> kDims = {
1, 2, 4, 8, 16, 32, 64, 128,
256, 512, 1024, 2048, 4096, 8192, 16384, 65536};
const int64_t kMinK = 8;
static const std::array<int64_t, 18> kDims = {
1, 2, 4, 8, 16, 32, 64, 128, 256,
512, 1024, 2048, 4096, 8192, 16384, 32768, 65536};
const int64_t kMinK = 1024;
const int64_t kMinM = 32;
const int64_t kMaxSmall = 16;
const int64_t kMinHuge = 1024;
const int64_t kMinFLOPs = (int64_t)1 << 16;
const int64_t kMaxFLOPs = (int64_t)1 << 30;

for (int64_t m : kDims) {
if (m < kMinM) {
continue;
}
for (int64_t k : kDims) {
if (k < kMinK) {
continue;
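As an aside for review, here is a standalone sketch of how these filters prune the (M, K, N) grid. This is a hypothetical helper, not code from the PR; it assumes the elided tail of the loop applies the `kMinFLOPs`/`kMaxFLOPs` bounds, and it omits the `kMaxSmall`/`kMinHuge` rules entirely:

```cpp
#include <array>
#include <cstdint>
#include <vector>

// Sketch: enumerate the (M, K, N) benchmark grid under the visible filters.
std::vector<std::array<int64_t, 3>> EnumerateFullyConnectedArgs() {
  static const std::array<int64_t, 17> kDims = {
      1,   2,    4,    8,    16,   32,    64,    128,  256,
      512, 1024, 2048, 4096, 8192, 16384, 32768, 65536};
  const int64_t kMinK = 1024;
  const int64_t kMinM = 32;
  const int64_t kMinFLOPs = INT64_C(1) << 16;
  const int64_t kMaxFLOPs = INT64_C(1) << 30;
  std::vector<std::array<int64_t, 3>> args;
  for (int64_t m : kDims) {
    if (m < kMinM) continue;  // New in this change: skip small batch sizes.
    for (int64_t k : kDims) {
      if (k < kMinK) continue;  // Raised from 8 to 1024 in this change.
      for (int64_t n : kDims) {
        const int64_t flops = m * k * n;  // One MAC per output element.
        // Assumed to match the hidden part of the hunk.
        if (flops < kMinFLOPs || flops > kMaxFLOPs) continue;
        args.push_back({m, k, n});
      }
    }
  }
  return args;
}
```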
14 changes: 14 additions & 0 deletions src/configs/BUILD
@@ -6,6 +6,7 @@
load(
"//:build_defs.bzl",
"xnnpack_cc_library",
"xnnpack_cxx_library",
"xnnpack_select_if",
)

@@ -48,11 +49,24 @@ xnnpack_cc_library(
],
)

xnnpack_cxx_library(
name = "hardware_utils",
srcs = ["hardware_utils.cc"],
hdrs = ["hardware_utils.h"],
textual_hdrs = ["//:src/xnnpack/hardware-config.h"],
deps = [
"//:common",
"//:logging",
"@com_google_benchmark//:benchmark",
],
)

xnnpack_cc_library(
name = "hardware_config",
srcs = ["hardware-config.c"],
hdrs = ["//:src/xnnpack/hardware-config.h"],
deps = [
":hardware_utils",
"//:common",
"//:init_once",
"//:logging",
2 changes: 2 additions & 0 deletions src/configs/hardware-config.c
@@ -6,6 +6,7 @@
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include "src/configs/hardware_utils.h"

#if XNN_ENABLE_CPUINFO
#include <cpuinfo.h>
@@ -462,6 +463,7 @@ static void init_hardware_config(void) {
hardware_config.uarch[i] = xnn_uarch_unknown;
}
#endif // XNN_ENABLE_CPUINFO
xnn_set_cache_data(&hardware_config);
}

const struct xnn_hardware_config* xnn_init_hardware_config() {
35 changes: 35 additions & 0 deletions src/configs/hardware_utils.cc
@@ -0,0 +1,35 @@
#include "src/configs/hardware_utils.h"

#include "src/xnnpack/hardware-config.h"
#include "src/xnnpack/log.h"
#include <benchmark/benchmark.h>

bool xnn_set_cache_data(struct xnn_hardware_config* hardware_config) {
// Get the CPUInfo.
const benchmark::CPUInfo& cpu_info = benchmark::CPUInfo::Get();

// Populate the `hardware_config` fields with it.
for (const auto& cache : cpu_info.caches) {
if (cache.level == 1 && (cache.type == "Data" || cache.type == "Unified")) {
hardware_config->l1_data_cache_bytes = cache.size;
xnn_log_info(
"l1_data_cache_bytes=%zu, l1_data_cache_line_size=%zu, "
"l1_data_cache_associativity=%zu, l1_data_cache_num_sets=%zu.",
hardware_config->l1_data_cache_bytes,
hardware_config->l1_data_cache_line_size,
hardware_config->l1_data_cache_associativity,
hardware_config->l1_data_cache_num_sets);
} else if (cache.level == 2 &&
(cache.type == "Data" || cache.type == "Unified")) {
hardware_config->l2_data_cache_bytes = cache.size;
xnn_log_info(
"l2_data_cache_bytes=%zu, l2_data_cache_line_size=%zu, "
"l2_data_cache_associativity=%zu, l2_data_cache_num_sets=%zu.",
hardware_config->l2_data_cache_bytes,
hardware_config->l2_data_cache_line_size,
hardware_config->l2_data_cache_associativity,
hardware_config->l2_data_cache_num_sets);
}
}
return true;
}
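For reviewers unfamiliar with the new dependency, the same data can be inspected directly via Google Benchmark. A minimal sketch using only the API already visible in this file (`benchmark::CPUInfo::Get()` and its `caches` vector):

```cpp
#include <benchmark/benchmark.h>
#include <iostream>

int main() {
  // CPUInfo::Get() returns a lazily-initialized, process-wide snapshot.
  const benchmark::CPUInfo& cpu_info = benchmark::CPUInfo::Get();
  for (const auto& cache : cpu_info.caches) {
    // xnn_set_cache_data() above keeps only the sizes of the level-1 and
    // level-2 "Data"/"Unified" entries.
    std::cout << "L" << cache.level << ' ' << cache.type << " cache: "
              << cache.size << " bytes\n";
  }
  return 0;
}
```

Note that the function writes only the `*_data_cache_bytes` fields; the line sizes, associativities, and set counts it logs presumably come from whatever populated the struct earlier in `init_hardware_config()`.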
18 changes: 18 additions & 0 deletions src/configs/hardware_utils.h
@@ -0,0 +1,18 @@
#ifndef XNNPACK_SRC_CONFIGS_HARDWARE_UTILS_H_
#define XNNPACK_SRC_CONFIGS_HARDWARE_UTILS_H_

#include "src/xnnpack/common.h"
#include "src/xnnpack/hardware-config.h"

#ifdef __cplusplus
extern "C" {
#endif

XNN_INTERNAL bool xnn_set_cache_data(
struct xnn_hardware_config* hardware_config);

#ifdef __cplusplus
}
#endif

#endif // XNNPACK_SRC_CONFIGS_HARDWARE_UTILS_H_
20 changes: 9 additions & 11 deletions src/microkernel-utils.c
@@ -155,17 +155,15 @@ size_t xnn_gemm_best_tile_size(size_t num_groups, size_t m, size_t n,

// Checks whether to use the `nr2` config or not.
bool xnn_use_nr2(size_t nr, size_t nr2, size_t output_channels) {
size_t nr_overcompute = (nr - output_channels % nr) % nr;
size_t nr2_overcompute = (nr2 - output_channels % nr2) % nr2;
// Switch to alternative microkernel when:
// 1. Alternative microkernel better supports fewer output channels, or
// 2. Alternative microkernel has less overcompute and default wastes >1% of
// output channels
if (nr > output_channels || (nr2_overcompute < nr_overcompute &&
nr_overcompute * 100 > output_channels)) {
// Default microkernel is suboptimal, use a microkernel that better
// supports fewer output channels.
return true;
if (nr > output_channels) {
size_t nr_overcompute = (nr - output_channels % nr) % nr;
size_t nr2_overcompute = (nr2 - output_channels % nr2) % nr2;
// Switch to alternative microkernel when:
// 1. Alternative microkernel better supports fewer output channels, or
// 2. Alternative microkernel has less overcompute and default wastes >1% of
// output channels
return nr2_overcompute < nr_overcompute &&
nr_overcompute * 100 > output_channels;
}
return false;
}
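To make the arithmetic concrete, here is a self-contained check of the refactored predicate with hypothetical values (`use_nr2` below mirrors the function above):

```cpp
#include <cassert>
#include <cstddef>

// Mirror of xnn_use_nr2 after this change, for illustration only.
static bool use_nr2(size_t nr, size_t nr2, size_t output_channels) {
  if (nr > output_channels) {
    const size_t nr_overcompute = (nr - output_channels % nr) % nr;
    const size_t nr2_overcompute = (nr2 - output_channels % nr2) % nr2;
    return nr2_overcompute < nr_overcompute &&
           nr_overcompute * 100 > output_channels;
  }
  return false;
}

int main() {
  // nr = 32 pads 9 channels up to 32 (23 wasted columns); nr2 = 8 pads to 16
  // (7 wasted), and 23 > 1% of 9, so the alternative microkernel wins.
  assert(use_nr2(/*nr=*/32, /*nr2=*/8, /*output_channels=*/9));
  // With 31 channels both options waste exactly one column, so keep the
  // default.
  assert(!use_nr2(/*nr=*/32, /*nr2=*/8, /*output_channels=*/31));
  // nr = 32 does not exceed 64 channels, so the default is always kept.
  assert(!use_nr2(/*nr=*/32, /*nr2=*/8, /*output_channels=*/64));
  return 0;
}
```

Relative to the replaced code, the predicate is now fully gated on `nr > output_channels`: that condition alone no longer forces the switch, and the overcompute comparison no longer fires when `nr <= output_channels`.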
77 changes: 67 additions & 10 deletions src/subgraph.c
@@ -3621,6 +3621,57 @@ static enum xnn_status optimize_common_subgraphs_gemm_rhs_transpose(
return xnn_status_success;
}

// Converts batch-matrix-multiply nodes with 2D weights to fully-connected nodes
// for consistency.
static enum xnn_status optimize_common_subgraphs_bmm_to_fc(
xnn_subgraph_t subgraph, uint32_t node_id, size_t* changes) {
struct xnn_node* node = &subgraph->nodes[node_id];
if (node->type != xnn_node_type_batch_matrix_multiply) {
return xnn_status_success;
}

const uint32_t input_a_id = node->inputs[0];
const uint32_t input_b_id = node->inputs[1];
const uint32_t output_id = node->outputs[0];
struct xnn_value* input_b_value = &subgraph->values[input_b_id];
const enum xnn_datatype packed_input_datatype = node->packed_input_datatype;

// Weights must have exactly two dimensions, i.e., no batch dimensions.
if (input_b_value->shape.num_dims != 2) {
return xnn_status_success;
}

// If the weights are dynamic, restrict to fp32/fp16.
if (!xnn_value_is_static(input_b_value->allocation_type) &&
!(input_b_value->datatype == xnn_datatype_fp32 ||
input_b_value->datatype == xnn_datatype_fp16)) {
return xnn_status_success;
}

// Replace with a fully-connected node.
XNN_RETURN_IF_ERROR(
xnn_define_fully_connected(
subgraph,
/*output_min=*/-INFINITY, /*output_max=*/INFINITY, input_a_id,
input_b_id, /*bias_id=*/XNN_INVALID_VALUE_ID, output_id,
node->flags ^ XNN_FLAG_TRANSPOSE_WEIGHTS),
"Failed to create new `fully_connected` node.");
node = &subgraph->nodes[node_id];
*node = subgraph->nodes[--subgraph->num_nodes];
node->id = node_id;
node->packed_input_datatype = packed_input_datatype;
subgraph->values[input_a_id].flags |= XNN_FLAG_SQUASH_GROUPS;

xnn_log_info(
"Converted batch_matrix_multiply[#%u](v%03u, v%03u) to "
"fully_connected[#%u](v%03u, v%03u).",
node_id, input_a_id, input_b_id, node_id, input_a_id, input_b_id);
(*changes)++;

return xnn_status_success;
}

static enum xnn_status optimize_common_subgraphs_iter(
xnn_subgraph_t subgraph, uint32_t optimization_flags, size_t* changes) {
// Loop over the nodes in this subgraph.
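One subtlety in `optimize_common_subgraphs_bmm_to_fc` above is the `node->flags ^ XNN_FLAG_TRANSPOSE_WEIGHTS`. A sketch of the layout bookkeeping, under the assumption (per xnnpack.h conventions) that non-transposed batch-matrix-multiply B is `[K, N]`, non-transposed fully-connected weights are `[N, K]`, and `XNN_FLAG_TRANSPOSE_B` shares the transpose bit with `XNN_FLAG_TRANSPOSE_WEIGHTS`:

```cpp
#include <cstdint>

// Hypothetical stand-in for the shared transpose bit (0x1 is assumed).
constexpr uint32_t kTransposeBit = 0x00000001;

// BMM's non-transposed B is [K, N], while FC's non-transposed weights are
// [N, K]; flipping the transpose bit keeps the effective layout unchanged.
constexpr uint32_t BmmFlagsToFcFlags(uint32_t bmm_flags) {
  return bmm_flags ^ kTransposeBit;
}

static_assert(BmmFlagsToFcFlags(0) == kTransposeBit,
              "BMM B = [K, N] maps to FC with transposed weights");
static_assert(BmmFlagsToFcFlags(kTransposeBit) == 0,
              "BMM B = [N, K] maps to FC with default [N, K] weights");
```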
@@ -3739,8 +3790,14 @@ static enum xnn_status optimize_common_subgraphs_iter(
// be pushed back to the static value.
break;

case xnn_node_type_fully_connected:
case xnn_node_type_batch_matrix_multiply:
// Convert batch-matrix-multiply nodes with 2D weights to
// fully-connected nodes for consistency.
XNN_RETURN_IF_ERROR(
optimize_common_subgraphs_bmm_to_fc(subgraph, node_id, changes));
XNN_FALLTHROUGH

case xnn_node_type_fully_connected:
// Merge or remove transposes of the RHS of a batch-matrix-multiply or
// fully-connected op.
XNN_RETURN_IF_ERROR(optimize_common_subgraphs_gemm_rhs_transpose(
@@ -3907,8 +3964,8 @@ enum xnn_status xnn_subgraph_optimize_packed_lhs(xnn_subgraph_t subgraph,
input_id, xnn_node_type_to_string(xnn_node_type_convert),
xnn_datatype_to_string(input_datatype),
xnn_datatype_to_string(xnn_datatype_qpint8));
subgraph->values[input_id].datatype = assumed_datatype;
subgraph->values[input_id].gemm_config = gemm_config;
input_value->datatype = assumed_datatype;
input_value->gemm_config = gemm_config;
} else {
// Insert a node to pack the LHS.
xnn_log_debug(
@@ -3920,15 +3977,15 @@ enum xnn_status xnn_subgraph_optimize_packed_lhs(xnn_subgraph_t subgraph,
uint32_t new_id = XNN_INVALID_VALUE_ID;
XNN_RETURN_IF_ERROR(
xnn_insert_pack_lh_node(subgraph, input_id, &new_id));
subgraph->nodes[node_id].inputs[0] = new_id;
node = &subgraph->nodes[node_id];
node->inputs[0] = new_id;
changes++;
}
// If this is a fully-connected op, we need to coerce the shape of
// the inputs from `[B, M, K]` to `[B * M, K]` to avoid batch-wise
// packing.
if (node->type == xnn_node_type_fully_connected) {
subgraph->values[subgraph->nodes[node_id].inputs[0]].flags |=
XNN_FLAG_SQUASH_GROUPS;
subgraph->values[node->inputs[0]].flags |= XNN_FLAG_SQUASH_GROUPS;
}
} else {
if (input_datatype == xnn_datatype_qdint8) {
@@ -4178,10 +4235,6 @@ enum xnn_status xnn_subgraph_optimize(xnn_subgraph_t subgraph,
return xnn_status_unsupported_hardware;
}

// Apply some common subgraph optimizations.
XNN_RETURN_IF_ERROR(
xnn_subgraph_optimize_common_subgraphs(subgraph, optimization_flags));

if ((optimization_flags & XNN_FLAG_FORCE_FP16_INFERENCE) &&
(!xnn_is_f16_compatible_config(hardware_config))) {
xnn_log_error(
@@ -4234,6 +4287,10 @@ enum xnn_status xnn_subgraph_optimize(xnn_subgraph_t subgraph,
XNN_RETURN_IF_ERROR(
xnn_subgraph_optimize_packed_lhs(subgraph, optimization_flags));

// Apply some common subgraph optimizations.
XNN_RETURN_IF_ERROR(
xnn_subgraph_optimize_common_subgraphs(subgraph, optimization_flags));

return xnn_status_success;
}
