12 changes: 8 additions & 4 deletions bench/subgraph/fully-connected.cc
@@ -202,16 +202,20 @@ BENCHMARK(QD8FullyConnected)
static void FullyConnectedArgs(benchmark::internal::Benchmark* b) {
b->ArgNames({"M", "K", "N"});

static const std::array<int64_t, 17> kDims = {
1, 2, 4, 8, 16, 32, 64, 128,
256, 512, 1024, 2048, 4096, 8192, 16384, 65536};
const int64_t kMinK = 8;
static const std::array<int64_t, 18> kDims = {
1, 2, 4, 8, 16, 32, 64, 128, 256,
512, 1024, 2048, 4096, 8192, 16384, 32768, 65536};
const int64_t kMinK = 1024;
const int64_t kMinM = 32;
const int64_t kMaxSmall = 16;
const int64_t kMinHuge = 1024;
const int64_t kMinFLOPs = (int64_t)1 << 16;
const int64_t kMaxFLOPs = (int64_t)1 << 30;

for (int64_t m : kDims) {
if (m < kMinM) {
continue;
}
for (int64_t k : kDims) {
if (k < kMinK) {
continue;
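As an aside for review, here is a standalone sketch of how these filters prune the (M, K, N) grid. This is a hypothetical helper, not code from the PR; it assumes the elided tail of the loop applies the `kMinFLOPs`/`kMaxFLOPs` bounds, and it omits the `kMaxSmall`/`kMinHuge` rules entirely:

```cpp
#include <array>
#include <cstdint>
#include <vector>

// Sketch: enumerate the (M, K, N) benchmark grid under the visible filters.
std::vector<std::array<int64_t, 3>> EnumerateFullyConnectedArgs() {
  static const std::array<int64_t, 17> kDims = {
      1,   2,    4,    8,    16,   32,    64,    128,  256,
      512, 1024, 2048, 4096, 8192, 16384, 32768, 65536};
  const int64_t kMinK = 1024;
  const int64_t kMinM = 32;
  const int64_t kMinFLOPs = INT64_C(1) << 16;
  const int64_t kMaxFLOPs = INT64_C(1) << 30;
  std::vector<std::array<int64_t, 3>> args;
  for (int64_t m : kDims) {
    if (m < kMinM) continue;  // New in this change: skip small batch sizes.
    for (int64_t k : kDims) {
      if (k < kMinK) continue;  // Raised from 8 to 1024 in this change.
      for (int64_t n : kDims) {
        const int64_t flops = m * k * n;  // One MAC per output element.
        // Assumed to match the hidden part of the hunk.
        if (flops < kMinFLOPs || flops > kMaxFLOPs) continue;
        args.push_back({m, k, n});
      }
    }
  }
  return args;
}
```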
14 changes: 14 additions & 0 deletions src/configs/BUILD
@@ -6,6 +6,7 @@
load(
"//:build_defs.bzl",
"xnnpack_cc_library",
"xnnpack_cxx_library",
"xnnpack_select_if",
)

@@ -48,11 +49,24 @@ xnnpack_cc_library(
],
)

xnnpack_cxx_library(
name = "hardware_utils",
srcs = ["hardware_utils.cc"],
hdrs = ["hardware_utils.h"],
textual_hdrs = ["//:src/xnnpack/hardware-config.h"],
deps = [
"//:common",
"//:logging",
"@com_google_benchmark//:benchmark",
],
)

xnnpack_cc_library(
name = "hardware_config",
srcs = ["hardware-config.c"],
hdrs = ["//:src/xnnpack/hardware-config.h"],
deps = [
":hardware_utils",
"//:common",
"//:init_once",
"//:logging",
2 changes: 2 additions & 0 deletions src/configs/hardware-config.c
@@ -6,6 +6,7 @@
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include "src/configs/hardware_utils.h"

#if XNN_ENABLE_CPUINFO
#include <cpuinfo.h>
@@ -462,6 +463,7 @@ static void init_hardware_config(void) {
hardware_config.uarch[i] = xnn_uarch_unknown;
}
#endif // XNN_ENABLE_CPUINFO
xnn_set_cache_data(&hardware_config);
}

const struct xnn_hardware_config* xnn_init_hardware_config() {
35 changes: 35 additions & 0 deletions src/configs/hardware_utils.cc
@@ -0,0 +1,35 @@
#include "src/configs/hardware_utils.h"

#include "src/xnnpack/hardware-config.h"
#include "src/xnnpack/log.h"
#include <benchmark/benchmark.h>

bool xnn_set_cache_data(struct xnn_hardware_config* hardware_config) {
// Get the CPUInfo.
const benchmark::CPUInfo& cpu_info = benchmark::CPUInfo::Get();

// Populate the `hardware_config` fields with it.
for (const auto& cache : cpu_info.caches) {
if (cache.level == 1 && (cache.type == "Data" || cache.type == "Unified")) {
hardware_config->l1_data_cache_bytes = cache.size;
xnn_log_info(
"l1_data_cache_bytes=%zu, l1_data_cache_line_size=%zu, "
"l1_data_cache_associativity=%zu, l1_data_cache_num_sets=%zu.",
hardware_config->l1_data_cache_bytes,
hardware_config->l1_data_cache_line_size,
hardware_config->l1_data_cache_associativity,
hardware_config->l1_data_cache_num_sets);
} else if (cache.level == 2 &&
(cache.type == "Data" || cache.type == "Unified")) {
hardware_config->l2_data_cache_bytes = cache.size;
xnn_log_info(
"l2_data_cache_bytes=%zu, l2_data_cache_line_size=%zu, "
"l2_data_cache_associativity=%zu, l2_data_cache_num_sets=%zu.",
hardware_config->l2_data_cache_bytes,
hardware_config->l2_data_cache_line_size,
hardware_config->l2_data_cache_associativity,
hardware_config->l2_data_cache_num_sets);
}
}
return true;
}
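For reviewers unfamiliar with the new dependency, the same data can be inspected directly via Google Benchmark. A minimal sketch using only the API already visible in this file (`benchmark::CPUInfo::Get()` and its `caches` vector):

```cpp
#include <benchmark/benchmark.h>
#include <iostream>

int main() {
  // CPUInfo::Get() returns a lazily-initialized, process-wide snapshot.
  const benchmark::CPUInfo& cpu_info = benchmark::CPUInfo::Get();
  for (const auto& cache : cpu_info.caches) {
    // xnn_set_cache_data() above keeps only the sizes of the level-1 and
    // level-2 "Data"/"Unified" entries.
    std::cout << "L" << cache.level << ' ' << cache.type << " cache: "
              << cache.size << " bytes\n";
  }
  return 0;
}
```

Note that the function writes only the `*_data_cache_bytes` fields; the line sizes, associativities, and set counts it logs presumably come from whatever populated the struct earlier in `init_hardware_config()`.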
18 changes: 18 additions & 0 deletions src/configs/hardware_utils.h
@@ -0,0 +1,18 @@
#ifndef XNNPACK_SRC_CONFIGS_HARDWARE_UTILS_H_
#define XNNPACK_SRC_CONFIGS_HARDWARE_UTILS_H_

#include "src/xnnpack/common.h"
#include "src/xnnpack/hardware-config.h"

#ifdef __cplusplus
extern "C" {
#endif

XNN_INTERNAL bool xnn_set_cache_data(
struct xnn_hardware_config* hardware_config);

#ifdef __cplusplus
}
#endif

#endif // XNNPACK_SRC_CONFIGS_HARDWARE_UTILS_H_
20 changes: 9 additions & 11 deletions src/microkernel-utils.c
@@ -155,17 +155,15 @@ size_t xnn_gemm_best_tile_size(size_t num_groups, size_t m, size_t n,

// Checks whether to use the `nr2` config or not.
bool xnn_use_nr2(size_t nr, size_t nr2, size_t output_channels) {
size_t nr_overcompute = (nr - output_channels % nr) % nr;
size_t nr2_overcompute = (nr2 - output_channels % nr2) % nr2;
// Switch to alternative microkernel when:
// 1. Alternative microkernel better supports fewer output channels, or
// 2. Alternative microkernel has less overcompute and default wastes >1% of
// output channels
if (nr > output_channels || (nr2_overcompute < nr_overcompute &&
nr_overcompute * 100 > output_channels)) {
// Default microkernel is suboptimal, use a microkernel that better
// supports fewer output channels.
return true;
if (nr > output_channels) {
size_t nr_overcompute = (nr - output_channels % nr) % nr;
size_t nr2_overcompute = (nr2 - output_channels % nr2) % nr2;
// Switch to alternative microkernel when:
// 1. Alternative microkernel better supports fewer output channels, or
// 2. Alternative microkernel has less overcompute and default wastes >1% of
// output channels
return nr2_overcompute < nr_overcompute &&
nr_overcompute * 100 > output_channels;
}
return false;
}
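To make the arithmetic concrete, here is a self-contained check of the refactored predicate with hypothetical values (`use_nr2` below mirrors the function above):

```cpp
#include <cassert>
#include <cstddef>

// Mirror of xnn_use_nr2 after this change, for illustration only.
static bool use_nr2(size_t nr, size_t nr2, size_t output_channels) {
  if (nr > output_channels) {
    const size_t nr_overcompute = (nr - output_channels % nr) % nr;
    const size_t nr2_overcompute = (nr2 - output_channels % nr2) % nr2;
    return nr2_overcompute < nr_overcompute &&
           nr_overcompute * 100 > output_channels;
  }
  return false;
}

int main() {
  // nr = 32 pads 9 channels up to 32 (23 wasted columns); nr2 = 8 pads to 16
  // (7 wasted), and 23 > 1% of 9, so the alternative microkernel wins.
  assert(use_nr2(/*nr=*/32, /*nr2=*/8, /*output_channels=*/9));
  // With 31 channels both options waste exactly one column, so keep the
  // default.
  assert(!use_nr2(/*nr=*/32, /*nr2=*/8, /*output_channels=*/31));
  // nr = 32 does not exceed 64 channels, so the default is always kept.
  assert(!use_nr2(/*nr=*/32, /*nr2=*/8, /*output_channels=*/64));
  return 0;
}
```

Relative to the replaced code, the predicate is now fully gated on `nr > output_channels`: that condition alone no longer forces the switch, and the overcompute comparison no longer fires when `nr <= output_channels`.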
77 changes: 67 additions & 10 deletions src/subgraph.c
@@ -3621,6 +3621,57 @@ static enum xnn_status optimize_common_subgraphs_gemm_rhs_transpose(
return xnn_status_success;
}

// Converts batch-matrix-multiply nodes with 2D weights to fully-connected nodes
// for consistency.
static enum xnn_status optimize_common_subgraphs_bmm_to_fc(
xnn_subgraph_t subgraph, uint32_t node_id, size_t* changes) {
struct xnn_node* node = &subgraph->nodes[node_id];
if (node->type != xnn_node_type_batch_matrix_multiply) {
return xnn_status_success;
}

const uint32_t input_a_id = node->inputs[0];
const uint32_t input_b_id = node->inputs[1];
const uint32_t output_id = node->outputs[0];
struct xnn_value* input_b_value = &subgraph->values[input_b_id];
const enum xnn_datatype packed_input_datatype = node->packed_input_datatype;

// Weights must have exactly two dimensions, i.e., no batch dimensions.
if (input_b_value->shape.num_dims != 2) {
return xnn_status_success;
}

// If the weights are dynamic, restrict to fp32/fp16.
if (!xnn_value_is_static(input_b_value->allocation_type) &&
!(input_b_value->datatype == xnn_datatype_fp32 ||
input_b_value->datatype == xnn_datatype_fp16)) {
return xnn_status_success;
}

// Replace with a fully-connected node.
XNN_RETURN_IF_ERROR(
xnn_define_fully_connected(
subgraph,
/*output_min=*/-INFINITY, /*output_max=*/INFINITY, input_a_id,
input_b_id, /*bias_id=*/XNN_INVALID_VALUE_ID, output_id,
node->flags ^ XNN_FLAG_TRANSPOSE_WEIGHTS),
"Failed to create new `fully_connected` node.");
node = &subgraph->nodes[node_id];
*node = subgraph->nodes[--subgraph->num_nodes];
node->id = node_id;
node->packed_input_datatype = packed_input_datatype;
subgraph->values[input_a_id].flags |= XNN_FLAG_SQUASH_GROUPS;

xnn_log_info(
"Converted batch_matrix_multiply[#%u](v%03u, v%03u) to "
"fully_connected[#%u](v%03u, v%03u).",
node_id, input_a_id, input_b_id, node_id, input_a_id, input_b_id);
(*changes)++;

return xnn_status_success;
}

static enum xnn_status optimize_common_subgraphs_iter(
xnn_subgraph_t subgraph, uint32_t optimization_flags, size_t* changes) {
// Loop over the nodes in this subgraph.
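One subtlety in `optimize_common_subgraphs_bmm_to_fc` above is the `node->flags ^ XNN_FLAG_TRANSPOSE_WEIGHTS`. A sketch of the layout bookkeeping, under the assumption (per xnnpack.h conventions) that non-transposed batch-matrix-multiply B is `[K, N]`, non-transposed fully-connected weights are `[N, K]`, and `XNN_FLAG_TRANSPOSE_B` shares the transpose bit with `XNN_FLAG_TRANSPOSE_WEIGHTS`:

```cpp
#include <cstdint>

// Hypothetical stand-in for the shared transpose bit (0x1 is assumed).
constexpr uint32_t kTransposeBit = 0x00000001;

// BMM's non-transposed B is [K, N], while FC's non-transposed weights are
// [N, K]; flipping the transpose bit keeps the effective layout unchanged.
constexpr uint32_t BmmFlagsToFcFlags(uint32_t bmm_flags) {
  return bmm_flags ^ kTransposeBit;
}

static_assert(BmmFlagsToFcFlags(0) == kTransposeBit,
              "BMM B = [K, N] maps to FC with transposed weights");
static_assert(BmmFlagsToFcFlags(kTransposeBit) == 0,
              "BMM B = [N, K] maps to FC with default [N, K] weights");
```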
@@ -3739,8 +3790,14 @@ static enum xnn_status optimize_common_subgraphs_iter(
// be pushed back to the static value.
break;

case xnn_node_type_fully_connected:
case xnn_node_type_batch_matrix_multiply:
// Convert batch-matrix-multiply nodes with 2D weights to
// fully-connected nodes for consistency.
XNN_RETURN_IF_ERROR(
optimize_common_subgraphs_bmm_to_fc(subgraph, node_id, changes));
XNN_FALLTHROUGH

case xnn_node_type_fully_connected:
// Merge or remove transposes of the RHS of a batch-matrix-multiply or
// fully-connected op.
XNN_RETURN_IF_ERROR(optimize_common_subgraphs_gemm_rhs_transpose(
@@ -3907,8 +3964,8 @@ enum xnn_status xnn_subgraph_optimize_packed_lhs(xnn_subgraph_t subgraph,
input_id, xnn_node_type_to_string(xnn_node_type_convert),
xnn_datatype_to_string(input_datatype),
xnn_datatype_to_string(xnn_datatype_qpint8));
subgraph->values[input_id].datatype = assumed_datatype;
subgraph->values[input_id].gemm_config = gemm_config;
input_value->datatype = assumed_datatype;
input_value->gemm_config = gemm_config;
} else {
// Insert a node to pack the LHS.
xnn_log_debug(
@@ -3920,15 +3977,15 @@ enum xnn_status xnn_subgraph_optimize_packed_lhs(xnn_subgraph_t subgraph,
uint32_t new_id = XNN_INVALID_VALUE_ID;
XNN_RETURN_IF_ERROR(
xnn_insert_pack_lh_node(subgraph, input_id, &new_id));
subgraph->nodes[node_id].inputs[0] = new_id;
node = &subgraph->nodes[node_id];
node->inputs[0] = new_id;
changes++;
}
// If this is a fully-connected op, we need to coerce the shape of
// the inputs from `[B, M, K]` to `[B * M, K]` to avoid batch-wise
// packing.
if (node->type == xnn_node_type_fully_connected) {
subgraph->values[subgraph->nodes[node_id].inputs[0]].flags |=
XNN_FLAG_SQUASH_GROUPS;
subgraph->values[node->inputs[0]].flags |= XNN_FLAG_SQUASH_GROUPS;
}
} else {
if (input_datatype == xnn_datatype_qdint8) {
@@ -4178,10 +4235,6 @@ enum xnn_status xnn_subgraph_optimize(xnn_subgraph_t subgraph,
return xnn_status_unsupported_hardware;
}

// Apply some common subgraph optimizations.
XNN_RETURN_IF_ERROR(
xnn_subgraph_optimize_common_subgraphs(subgraph, optimization_flags));

if ((optimization_flags & XNN_FLAG_FORCE_FP16_INFERENCE) &&
(!xnn_is_f16_compatible_config(hardware_config))) {
xnn_log_error(
@@ -4234,6 +4287,10 @@ enum xnn_status xnn_subgraph_optimize(xnn_subgraph_t subgraph,
XNN_RETURN_IF_ERROR(
xnn_subgraph_optimize_packed_lhs(subgraph, optimization_flags));

// Apply some common subgraph optimizations.
XNN_RETURN_IF_ERROR(
xnn_subgraph_optimize_common_subgraphs(subgraph, optimization_flags));

return xnn_status_success;
}
