From af57ffa0f6a769c8c681c04f2098017b7749bc5f Mon Sep 17 00:00:00 2001
From: Francesco Evangelista <francesco.evangelista@gmail.com>
Date: Wed, 24 Jun 2020 22:15:47 -0400
Subject: [PATCH 01/10] Started to work on adding TBLIS to ambit

---
 CMakeLists.txt          |   11 +-
 src/tensor/core/core.cc |   58 +-
 src/tensor/indices.cc   |   20 +-
 test/CMakeLists.txt     |   10 +
 test/test_tblis.cc      | 3154 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 3240 insertions(+), 13 deletions(-)
 create mode 100644 test/test_tblis.cc
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1bdcb05..64b2770 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ set(ambit_DESCRIPTION  "C++ library for the implementation of tensor product cal
 set(ambit_URL          "https://github.com/jturney/ambit")
 set(ambit_LICENSE      "GNU Lesser General Public License v3 (LGPL-3.0)")
 
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(TargetOpenMP_FIND_COMPONENTS "CXX")
 
@@ -56,9 +56,11 @@ option (SHARED_ONLY          "Compile only the shared library"         OFF)
 option (ENABLE_TESTS         "Compile the tests"                       ON)
 option (WITH_MPI             "Build the library with MPI"              OFF)
 option (ENABLE_CYCLOPS       "Enable Cyclops usage" OFF)
+option (ENABLE_TBLIS         "Enable TBLIS usage" OFF)
 option (BUILD_FPIC           "Static library in STATIC_ONLY will be compiled with position independent code" ON)
 option (CYCLOPS              "Location of the Cyclops build directory" "")
 option (ELEMENTAL            "Location of the Elemental build directory" "")
+option (TBLIS                "Location of the TBLIS build directory" "")
 option_with_print(ENABLE_OPENMP "Enable OpenMP parallelization" ON)
 option_with_flags(ENABLE_XHOST "Enables processor-specific optimization" ON
                   "-xHost" "-march=native")
@@ -133,6 +135,13 @@ if(ENABLE_CYCLOPS AND CYCLOPS)
     include_directories(${CYCLOPS}/include)
     add_definitions(-DHAVE_CYCLOPS)
 endif()
+if(ENABLE_TBLIS AND TBLIS)
+    set(CMAKE_CXX_STANDARD 17)
+    include_directories(${TBLIS}/include)
+    add_definitions(-DHAVE_TBLIS)
+    link_directories("/Users/fevange/Bin/tblis/lib")
+    link_libraries("${TBLIS}/lib/libtblis.a" "${TBLIS}/lib/libtci.a")
+endif()
 if (ENABLE_ELEMENTAL AND ELEMENTAL)
     include_directories(${ELEMENTAL}/include)
     add_definitions(-DHAVE_ELEMENTAL)
diff --git a/src/tensor/core/core.cc b/src/tensor/core/core.cc
index 6bcc763..5cd2f9e 100644
--- a/src/tensor/core/core.cc
+++ b/src/tensor/core/core.cc
@@ -28,6 +28,10 @@
  * @END LICENSE
  */
 
+#ifdef HAVE_TBLIS
+#include <tblis/tblis.h>
+#endif
+
 #include "core.h"
 #include "math/math.h"
 #include "tensor/indices.h"
@@ -35,12 +39,11 @@
 #include <ambit/print.h>
 #include <ambit/timer.h>
 #include <cmath>
+#include <iostream>
 #include <limits>
 #include <stdexcept>
 #include <string.h>
 
-//#include <boost/timer/timer.hpp>
-
 namespace ambit
 {
 
@@ -163,7 +166,58 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
     shared_ptr<TensorImpl> A2;
     shared_ptr<TensorImpl> B2;
     shared_ptr<TensorImpl> C2;
+#ifdef HAVE_TBLIS
+    std::cout << "Calling TBLIS" << std::endl;
+    size_t Asize = Ainds.size();
+    size_t Bsize = Binds.size();
+    size_t Csize = Cinds.size();
+
+    std::string indices_A = indices::to_string(Ainds, ",");
+    std::string indices_B = indices::to_string(Binds, ",");
+    std::string indices_C = indices::to_string(Cinds, ",");
+
+    std::cout << "\n indices_A " << indices_A << std::endl;
+    std::cout << "\n indices_B " << indices_B << std::endl;
+    std::cout << "\n indices_C " << indices_C << std::endl;
+
+    // Scalar multiplication
+    if ((Asize == 0) and (Bsize == 0) and (Csize == 0))
+    {
+        data_[0] = alpha * A->data()[0] * B->data()[0] + beta * data_[0];
+    }
+    // Index Type AB -> dot operation
+    else if ((Cinds.size() == 0) and (Ainds.size() > 0) and (Binds.size() > 0))
+    {
+        TensorImplPtr Ap = const_cast<TensorImplPtr>(A);
+        TensorImplPtr Bp = const_cast<TensorImplPtr>(B);
+        MArray::varray_view<double> A_v(Ap->dims(), &(Ap->data()[0]));
+        MArray::varray_view<double> B_v(Bp->dims(), &(Bp->data()[0]));
+        data_[0] = alpha * tblis::dot<double>(A_v, indices_A.c_str(), B_v,
+                                              indices_B.c_str()) +
+                   beta * data_[0];
+    }
+    // Index Type ABB -> multp operation
+    else if ((Ainds.size() > 0) and (Binds.size() > 0) and (Cinds.size() > 0))
+    {
+        std::cout << "\n Calling mult " << std::endl;
+        std::cout << "\n Get Ap " << std::endl;
+        TensorImplPtr Ap = const_cast<TensorImplPtr>(A);
+        std::cout << "\n Get Bp " << std::endl;
+        TensorImplPtr Bp = const_cast<TensorImplPtr>(B);
+        std::cout << "\n Get A_v " << std::endl;
+        MArray::varray_view<double> A_v(Ap->dims(), &(Ap->data()[0]));
+        std::cout << "\n Get B_v " << std::endl;
+        MArray::varray_view<double> B_v(Bp->dims(), &(Bp->data()[0]));
+        std::cout << "\n Get C_v " << std::endl;
+        MArray::varray_view<double> C_v(this->dims(), &(this->data()[0]));
+        std::cout << "\n CAll mult " << std::endl;
+        tblis::mult<double>(alpha, A_v, indices_A.c_str(), B_v,
+                            indices_B.c_str(), beta, C_v, indices_C.c_str());
+    }
+#else
     contract(A, B, Cinds, Ainds, Binds, A2, B2, C2, alpha, beta);
+
+#endif
 }
 
 void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
diff --git a/src/tensor/indices.cc b/src/tensor/indices.cc
index 472c273..f477e49 100644
--- a/src/tensor/indices.cc
+++ b/src/tensor/indices.cc
@@ -20,18 +20,18 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU Lesser General Public License along
- * with ambit; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with ambit; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  *
  * @END LICENSE
  */
 
 #include <ambit/common_types.h>
 
+#include "indices.h"
 #include <algorithm>
 #include <stdexcept>
-#include "indices.h"
 
 namespace ambit
 {
@@ -45,9 +45,9 @@ namespace
 // trim from start
 static inline string &ltrim(string &s)
 {
-    s.erase(s.begin(),
-            std::find_if(s.begin(), s.end(),
-                         std::not1(std::ptr_fun<int, int>(std::isspace))));
+    s.erase(s.begin(), std::find_if(s.begin(), s.end(),
+                                    [](int c) { return !std::isspace(c); }));
+
     return s;
 }
 
@@ -55,7 +55,7 @@ static inline string &ltrim(string &s)
 static inline string &rtrim(string &s)
 {
     s.erase(std::find_if(s.rbegin(), s.rend(),
-                         std::not1(std::ptr_fun<int, int>(std::isspace)))
+                         [](int c) { return !std::isspace(c); })
                 .base(),
             s.end());
     return s;
@@ -63,7 +63,7 @@ static inline string &rtrim(string &s)
 
 // trim from both ends
 static inline string &trim(string &s) { return ltrim(rtrim(s)); }
-}
+} // namespace
 
 Indices split(const string &indices)
 {
@@ -247,4 +247,4 @@ vector<Indices> determine_contraction_result(const LabeledTensor &A,
 
 } // namespace  indices
 
-} // namespace tensor
+} // namespace ambit
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 907a625..891620e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,3 +1,8 @@
+if(ENABLE_TBLIS)
+    set(CMAKE_CXX_STANDARD 17)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+endif()
+
 set(TEST_OPERATORS_SOURCES
     test_operators.cc
 )
@@ -34,6 +39,11 @@ if(ENABLE_CYCLOPS)
     add_test(NAME distributed COMMAND test_distributed)
 endif()
 
+if(ENABLE_TBLIS)
+    add_executable(test_tblis test_tblis.cc)
+    target_link_libraries(test_tblis ambit-lib)
+    add_test(NAME tblis COMMAND test_tblis)
+endif()
 #set(TEST_HF_SOURCES
 #    test_hf.cc
 #)
diff --git a/test/test_tblis.cc b/test/test_tblis.cc
new file mode 100644
index 0000000..628fa96
--- /dev/null
+++ b/test/test_tblis.cc
@@ -0,0 +1,3154 @@
+/*
+ * @BEGIN LICENSE
+ *
+ * ambit: C++ library for the implementation of tensor product calculations
+ *        through a clean, concise user interface.
+ *
+ * Copyright (c) 2014-2017 Ambit developers.
+ *
+ * The copyrights for code used from other parties are included in
+ * the corresponding files.
+ *
+ * This file is part of ambit.
+ *
+ * Ambit is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, version 3.
+ *
+ * Ambit is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with ambit; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * @END LICENSE
+ */
+
+#include <ambit/tensor.h>
+#include <cmath>
+#include <cstdlib>
+#include <stdexcept>
+
+#define ANSI_COLOR_RED "\x1b[31m"
+#define ANSI_COLOR_GREEN "\x1b[32m"
+#define ANSI_COLOR_YELLOW "\x1b[33m"
+#define ANSI_COLOR_BLUE "\x1b[34m"
+#define ANSI_COLOR_MAGENTA "\x1b[35m"
+#define ANSI_COLOR_CYAN "\x1b[36m"
+#define ANSI_COLOR_RESET "\x1b[0m"
+
+using namespace ambit;
+
+/// Expected relative accuracy for numerical exactness
+const double epsilon = 1.0E-14;
+
+/// Scheme to categorize expected vs. actual op behavior
+enum TestResult
+{
+    kExact,     // Some op occurred and the results match exactly
+    kEpsilon,   // Some op occurred and the results match to something
+                // proportional to epsilon
+    kDeviation, // Some op occurred and the results deviate significantly from
+                // epsilon
+    kException  // Some op occurred and the op threw an exception
+};
+
+/// Global alpha parameter
+double alpha = 1.0;
+/// Global beta parameter
+double beta = 0.0;
+/// 0 - explicit call, 1 - = OO, 2 - += OO, 3 - -= OO
+int mode = 0;
+
+std::string test_result_string(TestResult type)
+{
+    switch (type)
+    {
+    case kExact:
+        return "Exact";
+        break;
+    case kEpsilon:
+        return "Epsilon";
+        break;
+    case kDeviation:
+        return "Deviation";
+        break;
+    case kException:
+        return "Exception";
+        break;
+    default:
+        return "Unknown";
+    }
+}
+
+/// Try a function which returns a double against an expected behavior
+bool test_function(double (*trial_function)(), const std::string &label,
+                   TestResult expected)
+{
+    TestResult observed = kDeviation;
+    double delta = 0.0;
+    std::string exception;
+
+    try
+    {
+        delta = trial_function();
+        if (delta == 0.0 && expected != kEpsilon)
+            observed = kExact;
+        else if (delta <= epsilon)
+            observed = kEpsilon;
+        else
+            observed = kDeviation;
+    }
+    catch (std::exception &e)
+    {
+        observed = kException;
+        exception = e.what();
+    }
+
+    bool pass = observed == expected;
+    if (pass)
+        printf(ANSI_COLOR_GREEN);
+    else
+        printf(ANSI_COLOR_RED);
+
+    printf("%-50s ", label.c_str());
+    printf("%-9s ", test_result_string(expected).c_str());
+    printf("%-9s ", test_result_string(observed).c_str());
+    printf("%11.3E\n", delta);
+
+    printf(ANSI_COLOR_RESET);
+
+    if (exception.size())
+        printf(" Exception: %s\n", exception.c_str());
+
+    return pass;
+}
+
+double random_double() { return double(std::rand()) / double(RAND_MAX); }
+
+/// Initialize A1 to some random fill
+void initialize_random(Tensor &A1)
+{
+    size_t numel1 = A1.numel();
+    std::vector<double> &A1v = A1.data();
+    for (size_t ind = 0L; ind < numel1; ind++)
+    {
+        double randnum = double(std::rand()) / double(RAND_MAX);
+        A1v[ind] = randnum;
+    }
+}
+
+/// Initialize A1 and A2 to the same random fill
+void initialize_random(Tensor &A1, Tensor &A2)
+{
+    size_t numel1 = A1.numel();
+    size_t numel2 = A2.numel();
+    if (numel1 != numel2)
+        throw std::runtime_error("Tensors do not have same numel.");
+    std::vector<double> &A1v = A1.data();
+    std::vector<double> &A2v = A2.data();
+    for (size_t ind = 0L; ind < numel1; ind++)
+    {
+        double randnum = double(std::rand()) / double(RAND_MAX);
+        A1v[ind] = randnum;
+        A2v[ind] = randnum;
+    }
+}
+
+/// Returns |A1 - A2|_INF / |A2|_INF or 0.0 if |A1 - A2|_INF == 0.0
+double relative_difference(const Tensor &A1, const Tensor &A2)
+{
+    size_t numel1 = A1.numel();
+    size_t numel2 = A2.numel();
+    if (numel1 != numel2)
+        throw std::runtime_error("Tensors do not have same numel.");
+    const std::vector<double> &A1v = A1.data();
+    const std::vector<double> &A2v = A2.data();
+    double dmax = 0.0;
+    double Dmax = 0.0;
+    for (size_t ind = 0L; ind < numel1; ind++)
+    {
+        double d = fabs(A1v[ind] - A2v[ind]);
+        double D = fabs(A2v[ind]);
+        dmax = std::max(d, dmax);
+        Dmax = std::max(D, Dmax);
+    }
+    if (dmax == 0.0)
+        return 0.0;
+    else
+        return dmax / Dmax;
+}
+
+double try_relative_difference()
+{
+    Dimension Adims = {4, 5, 6};
+    Tensor A1 = Tensor::build(CoreTensor, "A1", Adims);
+    Tensor A2 = Tensor::build(CoreTensor, "A2", Adims);
+    initialize_random(A1, A2);
+    return relative_difference(A1, A2);
+}
+
+double try_1_norm()
+{
+    Dimension Adims = {4, 5, 6};
+    Tensor A1 = Tensor::build(CoreTensor, "A1", Adims);
+    Tensor A2 = Tensor::build(CoreTensor, "A2", Adims);
+    initialize_random(A1, A2);
+
+    double normA1 = A2.norm(1);
+
+    double normA2 = 0.0;
+    size_t numel = A2.numel();
+    std::vector<double> &A2v = A2.data();
+    for (size_t ind = 0L; ind < numel; ind++)
+    {
+        normA2 += fabs(A2v[ind]);
+    }
+
+    double delta = fabs(normA1 - normA2);
+    if (delta == 0.0)
+        return 0.0;
+    else
+        return delta / fabs(normA1 + normA2);
+}
+
+double try_2_norm()
+{
+    Dimension Adims = {4, 5, 6};
+    Tensor A1 = Tensor::build(CoreTensor, "A1", Adims);
+    Tensor A2 = Tensor::build(CoreTensor, "A2", Adims);
+    initialize_random(A1, A2);
+
+    double normA1 = A2.norm(2);
+
+    double normA2 = 0.0;
+    size_t numel = A2.numel();
+    std::vector<double> &A2v = A2.data();
+    for (size_t ind = 0L; ind < numel; ind++)
+    {
+        normA2 += A2v[ind] * A2v[ind];
+    }
+    normA2 = sqrt(normA2);
+
+    double delta = fabs(normA1 - normA2);
+    if (delta == 0.0)
+        return 0.0;
+    else
+        return delta / fabs(normA1 + normA2);
+}
+
+double try_inf_norm()
+{
+    Dimension Adims = {4, 5, 6};
+    Tensor A1 = Tensor::build(CoreTensor, "A1", Adims);
+    Tensor A2 = Tensor::build(CoreTensor, "A2", Adims);
+    initialize_random(A1, A2);
+
+    double normA1 = A2.norm(0);
+
+    double normA2 = 0.0;
+    size_t numel = A2.numel();
+    std::vector<double> &A2v = A2.data();
+    for (size_t ind = 0L; ind < numel; ind++)
+    {
+        normA2 = std::max(normA2, A2v[ind]);
+    }
+
+    double delta = fabs(normA1 - normA2);
+    if (delta == 0.0)
+        return 0.0;
+    else
+        return delta / fabs(normA1 + normA2);
+}
+
+double try_zero()
+{
+    Dimension Adims = {4, 5, 6};
+    Tensor A1 = Tensor::build(CoreTensor, "A1", Adims);
+    Tensor A2 = Tensor::build(CoreTensor, "A2", Adims);
+    initialize_random(A1, A2);
+
+    A1.zero();
+
+    size_t numel = A2.numel();
+    std::vector<double> &A2v = A2.data();
+    for (size_t ind = 0L; ind < numel; ind++)
+    {
+        A2v[ind] = 0.0;
+    }
+
+    return relative_difference(A1, A2);
+}
+
+double try_copy()
+{
+    Dimension Adims = {4, 5, 6};
+    Tensor A1 = Tensor::build(CoreTensor, "A1", Adims);
+    Tensor A2 = Tensor::build(CoreTensor, "A2", Adims);
+    initialize_random(A1, A2);
+
+    A1.zero();
+    A1.copy(A2);
+
+    return relative_difference(A1, A2);
+}
+
+double try_scale()
+{
+    Dimension Adims = {4, 5, 6};
+    Tensor A1 = Tensor::build(CoreTensor, "A1", Adims);
+    Tensor A2 = Tensor::build(CoreTensor, "A2", Adims);
+    initialize_random(A1, A2);
+
+    double s = random_double();
+    A1.scale(s);
+
+    size_t numel = A2.numel();
+    std::vector<double> &A2v = A2.data();
+    for (size_t ind = 0L; ind < numel; ind++)
+    {
+        A2v[ind] *= s;
+    }
+
+    return relative_difference(A1, A2);
+}
+
+double try_slice_rank0_same1()
+{
+    Dimension Cdims = {};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.slice(A, {}, {}, alpha, beta);
+    else if (mode == 1)
+        C1() = A();
+    else if (mode == 2)
+        C1() += A();
+    else if (mode == 3)
+        C1() -= A();
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    Cv[0] = alpha * Av[0] + beta * Cv[0];
+
+    return relative_difference(C1, C2);
+}
+
+double try_slice_rank1_same1()
+{
+    Dimension Cdims = {4};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{0L, 4L}};
+    IndexRange Ainds = {{0L, 4L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1() = A();
+    else if (mode == 2)
+        C1() += A();
+    else if (mode == 3)
+        C1() -= A();
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        Cv[(i + Cinds[0][0])] =
+            alpha * Av[(i + Ainds[0][0])] + beta * Cv[(i + Cinds[0][0])];
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank1_same2()
+{
+    Dimension Cdims = {4};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{1L, 3L}};
+    IndexRange Ainds = {{2L, 4L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        Cv[(i + Cinds[0][0])] =
+            alpha * Av[(i + Ainds[0][0])] + beta * Cv[(i + Cinds[0][0])];
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank1_diff1()
+{
+    Dimension Cdims = {4};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{1L, 4L}};
+    IndexRange Ainds = {{0L, 3L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        Cv[(i + Cinds[0][0])] =
+            alpha * Av[(i + Ainds[0][0])] + beta * Cv[(i + Cinds[0][0])];
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank1_diff2()
+{
+    Dimension Cdims = {4};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{2L, 4L}};
+    IndexRange Ainds = {{1L, 3L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        Cv[(i + Cinds[0][0])] =
+            alpha * Av[(i + Ainds[0][0])] + beta * Cv[(i + Cinds[0][0])];
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_slice_rank2_same1()
+{
+    Dimension Cdims = {4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{0L, 4L}, {0L, 5L}};
+    IndexRange Ainds = {{0L, 4L}, {0L, 5L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1() = A();
+    else if (mode == 2)
+        C1() += A();
+    else if (mode == 3)
+        C1() -= A();
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            Cv[(i + Cinds[0][0]) * Cdims[1] + (j + Cinds[1][0])] =
+                alpha * Av[(i + Ainds[0][0]) * Adims[1] + (j + Ainds[1][0])] +
+                beta * Cv[(i + Cinds[0][0]) * Cdims[1] + (j + Cinds[1][0])];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank2_same2()
+{
+    Dimension Cdims = {4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{1L, 3L}, {0L, 5L}};
+    IndexRange Ainds = {{2L, 4L}, {0L, 5L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            Cv[(i + Cinds[0][0]) * Cdims[1] + (j + Cinds[1][0])] =
+                alpha * Av[(i + Ainds[0][0]) * Adims[1] + (j + Ainds[1][0])] +
+                beta * Cv[(i + Cinds[0][0]) * Cdims[1] + (j + Cinds[1][0])];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank2_same3()
+{
+    Dimension Cdims = {4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{1L, 3L}, {2L, 5L}};
+    IndexRange Ainds = {{2L, 4L}, {1L, 4L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            Cv[(i + Cinds[0][0]) * Cdims[1] + (j + Cinds[1][0])] =
+                alpha * Av[(i + Ainds[0][0]) * Adims[1] + (j + Ainds[1][0])] +
+                beta * Cv[(i + Cinds[0][0]) * Cdims[1] + (j + Cinds[1][0])];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank2_diff1()
+{
+    Dimension Cdims = {4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{1L, 4L}, {1L, 5L}};
+    IndexRange Ainds = {{0L, 3L}, {0L, 4L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            Cv[(i + Cinds[0][0]) * Cdims[1] + (j + Cinds[1][0])] =
+                alpha * Av[(i + Ainds[0][0]) * Adims[1] + (j + Ainds[1][0])] +
+                beta * Cv[(i + Cinds[0][0]) * Cdims[1] + (j + Cinds[1][0])];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank2_diff2()
+{
+    Dimension Cdims = {4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{2L, 4L}, {1L, 5L}};
+    IndexRange Ainds = {{1L, 3L}, {0L, 4L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            Cv[(i + Cinds[0][0]) * Cdims[1] + (j + Cinds[1][0])] =
+                alpha * Av[(i + Ainds[0][0]) * Adims[1] + (j + Ainds[1][0])] +
+                beta * Cv[(i + Cinds[0][0]) * Cdims[1] + (j + Cinds[1][0])];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank2_diff3()
+{
+    Dimension Cdims = {4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{2L, 4L}, {2L, 5L}};
+    IndexRange Ainds = {{1L, 3L}, {1L, 4L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            Cv[(i + Cinds[0][0]) * Cdims[1] + (j + Cinds[1][0])] =
+                alpha * Av[(i + Ainds[0][0]) * Adims[1] + (j + Ainds[1][0])] +
+                beta * Cv[(i + Cinds[0][0]) * Cdims[1] + (j + Cinds[1][0])];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_slice_rank3_same1()
+{
+    Dimension Cdims = {4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4, 5, 6};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{0L, 4L}, {0L, 5L}, {0L, 6L}};
+    IndexRange Ainds = {{0L, 4L}, {0L, 5L}, {0L, 6L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1() = A();
+    else if (mode == 2)
+        C1() += A();
+    else if (mode == 3)
+        C1() -= A();
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            for (size_t k = 0; k < Cinds[2][1] - Cinds[2][0]; k++)
+            {
+                Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                   (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])] =
+                    alpha *
+                        Av[(i + Ainds[0][0]) * Adims[1] * Adims[2] +
+                           (j + Ainds[1][0]) * Adims[2] + (k + Ainds[2][0])] +
+                    beta * Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                              (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank3_same2()
+{
+    Dimension Cdims = {4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4, 5, 6};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{1L, 3L}, {0L, 5L}, {0L, 6L}};
+    IndexRange Ainds = {{2L, 4L}, {0L, 5L}, {0L, 6L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            for (size_t k = 0; k < Cinds[2][1] - Cinds[2][0]; k++)
+            {
+                Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                   (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])] =
+                    alpha *
+                        Av[(i + Ainds[0][0]) * Adims[1] * Adims[2] +
+                           (j + Ainds[1][0]) * Adims[2] + (k + Ainds[2][0])] +
+                    beta * Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                              (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank3_same3()
+{
+    Dimension Cdims = {4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4, 5, 6};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{1L, 3L}, {2L, 5L}, {0L, 6L}};
+    IndexRange Ainds = {{2L, 4L}, {1L, 4L}, {0L, 6L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            for (size_t k = 0; k < Cinds[2][1] - Cinds[2][0]; k++)
+            {
+                Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                   (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])] =
+                    alpha *
+                        Av[(i + Ainds[0][0]) * Adims[1] * Adims[2] +
+                           (j + Ainds[1][0]) * Adims[2] + (k + Ainds[2][0])] +
+                    beta * Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                              (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank3_same4()
+{
+    Dimension Cdims = {4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4, 5, 6};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{1L, 3L}, {2L, 5L}, {2L, 4L}};
+    IndexRange Ainds = {{2L, 4L}, {1L, 4L}, {3L, 5L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            for (size_t k = 0; k < Cinds[2][1] - Cinds[2][0]; k++)
+            {
+                Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                   (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])] =
+                    alpha *
+                        Av[(i + Ainds[0][0]) * Adims[1] * Adims[2] +
+                           (j + Ainds[1][0]) * Adims[2] + (k + Ainds[2][0])] +
+                    beta * Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                              (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank3_diff1()
+{
+    Dimension Cdims = {4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 4, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{1L, 4L}, {1L, 5L}, {1L, 6L}};
+    IndexRange Ainds = {{0L, 3L}, {0L, 4L}, {0L, 5L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            for (size_t k = 0; k < Cinds[2][1] - Cinds[2][0]; k++)
+            {
+                Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                   (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])] =
+                    alpha *
+                        Av[(i + Ainds[0][0]) * Adims[1] * Adims[2] +
+                           (j + Ainds[1][0]) * Adims[2] + (k + Ainds[2][0])] +
+                    beta * Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                              (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank3_diff2()
+{
+    Dimension Cdims = {4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 4, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{2L, 4L}, {1L, 5L}, {1L, 6L}};
+    IndexRange Ainds = {{1L, 3L}, {0L, 4L}, {0L, 5L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            for (size_t k = 0; k < Cinds[2][1] - Cinds[2][0]; k++)
+            {
+                Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                   (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])] =
+                    alpha *
+                        Av[(i + Ainds[0][0]) * Adims[1] * Adims[2] +
+                           (j + Ainds[1][0]) * Adims[2] + (k + Ainds[2][0])] +
+                    beta * Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                              (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank3_diff3()
+{
+    Dimension Cdims = {4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 4, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{2L, 4L}, {2L, 5L}, {1L, 6L}};
+    IndexRange Ainds = {{1L, 3L}, {1L, 4L}, {0L, 5L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            for (size_t k = 0; k < Cinds[2][1] - Cinds[2][0]; k++)
+            {
+                Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                   (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])] =
+                    alpha *
+                        Av[(i + Ainds[0][0]) * Adims[1] * Adims[2] +
+                           (j + Ainds[1][0]) * Adims[2] + (k + Ainds[2][0])] +
+                    beta * Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                              (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_slice_rank3_diff4()
+{
+    Dimension Cdims = {4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 4, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+
+    IndexRange Cinds = {{2L, 4L}, {2L, 5L}, {2L, 6L}};
+    IndexRange Ainds = {{1L, 3L}, {1L, 4L}, {1L, 5L}};
+
+    if (mode == 0)
+        C1.slice(A, Cinds, Ainds, alpha, beta);
+    else if (mode == 1)
+        C1(Cinds) = A(Ainds);
+    else if (mode == 2)
+        C1(Cinds) += A(Ainds);
+    else if (mode == 3)
+        C1(Cinds) -= A(Ainds);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cinds[0][1] - Cinds[0][0]; i++)
+    {
+        for (size_t j = 0; j < Cinds[1][1] - Cinds[1][0]; j++)
+        {
+            for (size_t k = 0; k < Cinds[2][1] - Cinds[2][0]; k++)
+            {
+                Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                   (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])] =
+                    alpha *
+                        Av[(i + Ainds[0][0]) * Adims[1] * Adims[2] +
+                           (j + Ainds[1][0]) * Adims[2] + (k + Ainds[2][0])] +
+                    beta * Cv[(i + Cinds[0][0]) * Cdims[1] * Cdims[2] +
+                              (j + Cinds[1][0]) * Cdims[2] + (k + Cinds[2][0])];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_slice_label_fail()
+{
+    Tensor C1 = Tensor::build(CoreTensor, "C1", {4, 5});
+    Tensor C2 = Tensor::build(CoreTensor, "C2", {4, 5});
+
+    C1((IndexRange){{0L, 4L}}) = C2();
+    return 0.0;
+}
+double try_slice_rank_fail()
+{
+    Tensor C1 = Tensor::build(CoreTensor, "C1", {4, 5});
+    Tensor C2 = Tensor::build(CoreTensor, "C2", {4});
+
+    C1() = C2();
+    return 0.0;
+}
+double try_slice_size_fail()
+{
+    Tensor C1 = Tensor::build(CoreTensor, "C1", {4, 5});
+    Tensor C2 = Tensor::build(CoreTensor, "C2", {4, 5});
+
+    C1({{0, 4}, {0, 4}}) = C2();
+    return 0.0;
+}
+double try_slice_bounds_fail()
+{
+    Tensor C1 = Tensor::build(CoreTensor, "C1", {4, 5});
+    Tensor C2 = Tensor::build(CoreTensor, "C2", {4, 5});
+
+    C1({{0, 6}, {0, 5}}) = C2({{0, 6}, {0, 5}});
+    return 0.0;
+}
+
+double try_permute_rank0()
+{
+    Dimension Cdims = {};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {}, {}, alpha, beta);
+    else if (mode == 1)
+        C1("") = A("");
+    else if (mode == 2)
+        C1("") += A("");
+    else if (mode == 3)
+        C1("") -= A("");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    Cv[0] = alpha * Av[0] + beta * Cv[0];
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank1_i()
+{
+    Dimension Cdims = {3};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i"}, {"i"}, alpha, beta);
+    else if (mode == 1)
+        C1("i") = A("i");
+    else if (mode == 2)
+        C1("i") += A("i");
+    else if (mode == 3)
+        C1("i") -= A("i");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        Cv[i] = alpha * Av[i] + beta * Cv[i];
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank2_ij()
+{
+    Dimension Cdims = {3, 4};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j"}, {"i", "j"}, alpha, beta);
+    else if (mode == 1)
+        C1("ij") = A("ij");
+    else if (mode == 2)
+        C1("ij") += A("ij");
+    else if (mode == 3)
+        C1("ij") -= A("ij");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            Cv[i * Cdims[1] + j] =
+                alpha * Av[i * Adims[1] + j] + beta * Cv[i * Cdims[1] + j];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank2_ji()
+{
+    Dimension Cdims = {3, 4};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4, 3};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j"}, {"j", "i"}, alpha, beta);
+    else if (mode == 1)
+        C1("ij") = A("ji");
+    else if (mode == 2)
+        C1("ij") += A("ji");
+    else if (mode == 3)
+        C1("ij") -= A("ji");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            Cv[i * Cdims[1] + j] =
+                alpha * Av[j * Adims[1] + i] + beta * Cv[i * Cdims[1] + j];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank3_ijk()
+{
+    Dimension Cdims = {3, 4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 4, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j", "k"}, {"i", "j", "k"}, alpha, beta);
+    else if (mode == 1)
+        C1("ijk") = A("ijk");
+    else if (mode == 2)
+        C1("ijk") += A("ijk");
+    else if (mode == 3)
+        C1("ijk") -= A("ijk");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                Cv[i * Cdims[1] * Cdims[2] + j * Cdims[2] + k] =
+                    alpha * Av[i * Adims[1] * Adims[2] + j * Adims[2] + k] +
+                    beta * Cv[i * Cdims[1] * Cdims[2] + j * Cdims[2] + k];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank3_ikj()
+{
+    Dimension Cdims = {3, 4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 5, 4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j", "k"}, {"i", "k", "j"}, alpha, beta);
+    else if (mode == 1)
+        C1("ijk") = A("ikj");
+    else if (mode == 2)
+        C1("ijk") += A("ikj");
+    else if (mode == 3)
+        C1("ijk") -= A("ikj");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                Cv[i * Cdims[1] * Cdims[2] + j * Cdims[2] + k] =
+                    alpha * Av[i * Adims[1] * Adims[2] + k * Adims[2] + j] +
+                    beta * Cv[i * Cdims[1] * Cdims[2] + j * Cdims[2] + k];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank3_jik()
+{
+    Dimension Cdims = {3, 4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4, 3, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j", "k"}, {"j", "i", "k"}, alpha, beta);
+    else if (mode == 1)
+        C1("ijk") = A("jik");
+    else if (mode == 2)
+        C1("ijk") += A("jik");
+    else if (mode == 3)
+        C1("ijk") -= A("jik");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                Cv[i * Cdims[1] * Cdims[2] + j * Cdims[2] + k] =
+                    alpha * Av[j * Adims[1] * Adims[2] + i * Adims[2] + k] +
+                    beta * Cv[i * Cdims[1] * Cdims[2] + j * Cdims[2] + k];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank3_jki()
+{
+    Dimension Cdims = {3, 4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4, 5, 3};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j", "k"}, {"j", "k", "i"}, alpha, beta);
+    else if (mode == 1)
+        C1("ijk") = A("jki");
+    else if (mode == 2)
+        C1("ijk") += A("jki");
+    else if (mode == 3)
+        C1("ijk") -= A("jki");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                Cv[i * Cdims[1] * Cdims[2] + j * Cdims[2] + k] =
+                    alpha * Av[j * Adims[1] * Adims[2] + k * Adims[2] + i] +
+                    beta * Cv[i * Cdims[1] * Cdims[2] + j * Cdims[2] + k];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank3_kij()
+{
+    Dimension Cdims = {3, 4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {5, 3, 4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j", "k"}, {"k", "i", "j"}, alpha, beta);
+    else if (mode == 1)
+        C1("ijk") = A("kij");
+    else if (mode == 2)
+        C1("ijk") += A("kij");
+    else if (mode == 3)
+        C1("ijk") -= A("kij");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                Cv[i * Cdims[1] * Cdims[2] + j * Cdims[2] + k] =
+                    alpha * Av[k * Adims[1] * Adims[2] + i * Adims[2] + j] +
+                    beta * Cv[i * Cdims[1] * Cdims[2] + j * Cdims[2] + k];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank3_kji()
+{
+    Dimension Cdims = {3, 4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {5, 4, 3};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j", "k"}, {"k", "j", "i"}, alpha, beta);
+    else if (mode == 1)
+        C1("ijk") = A("kji");
+    else if (mode == 2)
+        C1("ijk") += A("kji");
+    else if (mode == 3)
+        C1("ijk") -= A("kji");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                Cv[i * Cdims[1] * Cdims[2] + j * Cdims[2] + k] =
+                    alpha * Av[k * Adims[1] * Adims[2] + j * Adims[2] + i] +
+                    beta * Cv[i * Cdims[1] * Cdims[2] + j * Cdims[2] + k];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank4_ijkl()
+{
+    Dimension Cdims = {3, 4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 4, 5, 6};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j", "k", "l"}, {"i", "j", "k", "l"}, alpha, beta);
+    else if (mode == 1)
+        C1("ijkl") = A("ijkl");
+    else if (mode == 2)
+        C1("ijkl") += A("ijkl");
+    else if (mode == 3)
+        C1("ijkl") -= A("ijkl");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                for (size_t l = 0; l < Cdims[3]; l++)
+                {
+                    Cv[i * Cdims[1] * Cdims[2] * Cdims[3] +
+                       j * Cdims[2] * Cdims[3] + k * Cdims[3] + l] =
+                        alpha * Av[i * Adims[1] * Adims[2] * Adims[3] +
+                                   j * Adims[2] * Adims[3] + k * Adims[3] + l] +
+                        beta * Cv[i * Cdims[1] * Cdims[2] * Cdims[3] +
+                                  j * Cdims[2] * Cdims[3] + k * Cdims[3] + l];
+                }
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank4_ijlk()
+{
+    Dimension Cdims = {3, 4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 4, 6, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j", "k", "l"}, {"i", "j", "l", "k"}, alpha, beta);
+    else if (mode == 1)
+        C1("ijkl") = A("ijlk");
+    else if (mode == 2)
+        C1("ijkl") += A("ijlk");
+    else if (mode == 3)
+        C1("ijkl") -= A("ijlk");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                for (size_t l = 0; l < Cdims[3]; l++)
+                {
+                    Cv[i * Cdims[1] * Cdims[2] * Cdims[3] +
+                       j * Cdims[2] * Cdims[3] + k * Cdims[3] + l] =
+                        alpha * Av[i * Adims[1] * Adims[2] * Adims[3] +
+                                   j * Adims[2] * Adims[3] + l * Adims[3] + k] +
+                        beta * Cv[i * Cdims[1] * Cdims[2] * Cdims[3] +
+                                  j * Cdims[2] * Cdims[3] + k * Cdims[3] + l];
+                }
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank4_jikl()
+{
+    Dimension Cdims = {3, 4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4, 3, 5, 6};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j", "k", "l"}, {"j", "i", "k", "l"}, alpha, beta);
+    else if (mode == 1)
+        C1("ijkl") = A("jikl");
+    else if (mode == 2)
+        C1("ijkl") += A("jikl");
+    else if (mode == 3)
+        C1("ijkl") -= A("jikl");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                for (size_t l = 0; l < Cdims[3]; l++)
+                {
+                    Cv[i * Cdims[1] * Cdims[2] * Cdims[3] +
+                       j * Cdims[2] * Cdims[3] + k * Cdims[3] + l] =
+                        alpha * Av[j * Adims[1] * Adims[2] * Adims[3] +
+                                   i * Adims[2] * Adims[3] + k * Adims[3] + l] +
+                        beta * Cv[i * Cdims[1] * Cdims[2] * Cdims[3] +
+                                  j * Cdims[2] * Cdims[3] + k * Cdims[3] + l];
+                }
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank4_ikjl()
+{
+    Dimension Cdims = {3, 4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {3, 5, 4, 6};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j", "k", "l"}, {"i", "k", "j", "l"}, alpha, beta);
+    else if (mode == 1)
+        C1("ijkl") = A("ikjl");
+    else if (mode == 2)
+        C1("ijkl") += A("ikjl");
+    else if (mode == 3)
+        C1("ijkl") -= A("ikjl");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                for (size_t l = 0; l < Cdims[3]; l++)
+                {
+                    Cv[i * Cdims[1] * Cdims[2] * Cdims[3] +
+                       j * Cdims[2] * Cdims[3] + k * Cdims[3] + l] =
+                        alpha * Av[i * Adims[1] * Adims[2] * Adims[3] +
+                                   k * Adims[2] * Adims[3] + j * Adims[3] + l] +
+                        beta * Cv[i * Cdims[1] * Cdims[2] * Cdims[3] +
+                                  j * Cdims[2] * Cdims[3] + k * Cdims[3] + l];
+                }
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_rank4_lkji()
+{
+    Dimension Cdims = {3, 4, 5, 6};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {6, 5, 4, 3};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    if (mode == 0)
+        C1.permute(A, {"i", "j", "k", "l"}, {"l", "k", "j", "i"}, alpha, beta);
+    else if (mode == 1)
+        C1("ijkl") = A("lkji");
+    else if (mode == 2)
+        C1("ijkl") += A("lkji");
+    else if (mode == 3)
+        C1("ijkl") -= A("lkji");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Cv = C2.data();
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                for (size_t l = 0; l < Cdims[3]; l++)
+                {
+                    Cv[i * Cdims[1] * Cdims[2] * Cdims[3] +
+                       j * Cdims[2] * Cdims[3] + k * Cdims[3] + l] =
+                        alpha * Av[l * Adims[1] * Adims[2] * Adims[3] +
+                                   k * Adims[2] * Adims[3] + j * Adims[3] + i] +
+                        beta * Cv[i * Cdims[1] * Cdims[2] * Cdims[3] +
+                                  j * Cdims[2] * Cdims[3] + k * Cdims[3] + l];
+                }
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+
+double try_permute_label_fail()
+{
+    Dimension Cdims = {3, 4};
+    Tensor C = Tensor::build(CoreTensor, "C", Cdims);
+    initialize_random(C);
+
+    Dimension Adims = {3, 4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    C("ij") = A("i");
+
+    return 0.0;
+}
+
+double try_permute_rank_fail()
+{
+    Dimension Cdims = {3, 4};
+    Tensor C = Tensor::build(CoreTensor, "C", Cdims);
+    initialize_random(C);
+
+    Dimension Adims = {4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    C("ij") = A("i");
+
+    return 0.0;
+}
+
+double try_permute_index_fail()
+{
+    Dimension Cdims = {3, 4};
+    Tensor C = Tensor::build(CoreTensor, "C", Cdims);
+    initialize_random(C);
+
+    Dimension Adims = {3, 4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    C("ij") = A("jk");
+
+    return 0.0;
+}
+
+double try_permute_size_fail()
+{
+    Dimension Cdims = {3, 4};
+    Tensor C = Tensor::build(CoreTensor, "C", Cdims);
+    initialize_random(C);
+
+    Dimension Adims = {3, 4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    C("ij") = A("ji");
+
+    return 0.0;
+}
+
+double try_contract_scalar()
+{
+    Dimension Cdims = {};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    if (mode == 0)
+        C1.contract(A, B, {}, {}, {}, alpha, beta);
+    else if (mode == 1)
+        C1("") = A("") * B("");
+    else if (mode == 2)
+        C1("") += A("") * B("");
+    else if (mode == 3)
+        C1("") -= A("") * B("");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Bv = B.data();
+    std::vector<double> &Cv = C2.data();
+
+    Cv[0] = alpha * Av[0] * Bv[0] + beta * Cv[0];
+
+    return relative_difference(C1, C2);
+}
+double try_contract_hadamard()
+{
+    Dimension Cdims = {4};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {4};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    if (mode == 0)
+        C1.contract(A, B, {"i"}, {"i"}, {"i"}, alpha, beta);
+    else if (mode == 1)
+        C1("i") = A("i") * B("i");
+    else if (mode == 2)
+        C1("i") += A("i") * B("i");
+    else if (mode == 3)
+        C1("i") -= A("i") * B("i");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Bv = B.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        Cv[i] = alpha * Av[i] * Bv[i] + beta * Cv[i];
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_contract_dot()
+{
+    Dimension Cdims = {};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {4};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    if (mode == 0)
+        C1.contract(A, B, {}, {"i"}, {"i"}, alpha, beta);
+    else if (mode == 1)
+        C1("") = A("i") * B("i");
+    else if (mode == 2)
+        C1("") += A("i") * B("i");
+    else if (mode == 3)
+        C1("") -= A("i") * B("i");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Bv = B.data();
+    std::vector<double> &Cv = C2.data();
+
+    Cv[0] = beta * Cv[0];
+    for (size_t i = 0; i < Adims[0]; i++)
+    {
+        Cv[0] += alpha * Av[i] * Bv[i];
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_contract_axpy1()
+{
+    Dimension Cdims = {4};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {4};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    if (mode == 0)
+        C1.contract(A, B, {"i"}, {}, {"i"}, alpha, beta);
+    else if (mode == 1)
+        C1("i") = A("") * B("i");
+    else if (mode == 2)
+        C1("i") += A("") * B("i");
+    else if (mode == 3)
+        C1("i") -= A("") * B("i");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Bv = B.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        Cv[i] = alpha * Av[0] * Bv[i] + beta * Cv[i];
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_contract_axpy2()
+{
+    Dimension Cdims = {4};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    if (mode == 0)
+        C1.contract(A, B, {"i"}, {"i"}, {}, alpha, beta);
+    else if (mode == 1)
+        C1("i") = A("i") * B("");
+    else if (mode == 2)
+        C1("i") += A("i") * B("");
+    else if (mode == 3)
+        C1("i") -= A("i") * B("");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Bv = B.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        Cv[i] = alpha * Av[i] * Bv[0] + beta * Cv[i];
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_contract_ger1()
+{
+    Dimension Cdims = {4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {5};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    if (mode == 0)
+        C1.contract(A, B, {"i", "j"}, {"i"}, {"j"}, alpha, beta);
+    else if (mode == 1)
+        C1("ij") = A("i") * B("j");
+    else if (mode == 2)
+        C1("ij") += A("i") * B("j");
+    else if (mode == 3)
+        C1("ij") -= A("i") * B("j");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Bv = B.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            Cv[i * Cdims[1] + j] =
+                alpha * Av[i] * Bv[j] + beta * Cv[i * Cdims[1] + j];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_contract_ger2()
+{
+    Dimension Cdims = {4, 5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {4};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    if (mode == 0)
+        C1.contract(A, B, {"i", "j"}, {"j"}, {"i"}, alpha, beta);
+    else if (mode == 1)
+        C1("ij") = A("j") * B("i");
+    else if (mode == 2)
+        C1("ij") += A("j") * B("i");
+    else if (mode == 3)
+        C1("ij") -= A("j") * B("i");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Bv = B.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            Cv[i * Cdims[1] + j] =
+                alpha * Av[j] * Bv[i] + beta * Cv[i * Cdims[1] + j];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_contract_gemv1()
+{
+    Dimension Cdims = {4};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {5};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    if (mode == 0)
+        C1.contract(A, B, {"i"}, {"i", "j"}, {"j"}, alpha, beta);
+    else if (mode == 1)
+        C1("i") = A("ij") * B("j");
+    else if (mode == 2)
+        C1("i") += A("ij") * B("j");
+    else if (mode == 3)
+        C1("i") -= A("ij") * B("j");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Bv = B.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Adims[0]; i++)
+    {
+        Cv[i] = beta * Cv[i];
+        for (size_t j = 0; j < Adims[1]; j++)
+        {
+            Cv[i] += alpha * Av[i * Adims[1] + j] * Bv[j];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_contract_gemv2()
+{
+    Dimension Cdims = {4};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {5, 4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {5};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    if (mode == 0)
+        C1.contract(A, B, {"i"}, {"j", "i"}, {"j"}, alpha, beta);
+    else if (mode == 1)
+        C1("i") = A("ji") * B("j");
+    else if (mode == 2)
+        C1("i") += A("ji") * B("j");
+    else if (mode == 3)
+        C1("i") -= A("ji") * B("j");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Bv = B.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t i = 0; i < Adims[1]; i++)
+    {
+        Cv[i] = beta * Cv[i];
+        for (size_t j = 0; j < Adims[0]; j++)
+        {
+            Cv[i] += alpha * Av[j * Adims[1] + i] * Bv[j];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_contract_gemv3()
+{
+    Dimension Cdims = {5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {4, 5};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    if (mode == 0)
+        C1.contract(A, B, {"j"}, {"i"}, {"i", "j"}, alpha, beta);
+    else if (mode == 1)
+        C1("j") = A("i") * B("ij");
+    else if (mode == 2)
+        C1("j") += A("i") * B("ij");
+    else if (mode == 3)
+        C1("j") -= A("i") * B("ij");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Bv = B.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t j = 0; j < Bdims[1]; j++)
+    {
+        Cv[j] = beta * Cv[j];
+        for (size_t i = 0; i < Bdims[0]; i++)
+        {
+            Cv[j] += alpha * Av[i] * Bv[i * Bdims[1] + j];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_contract_gemv4()
+{
+    Dimension Cdims = {5};
+    Tensor C1 = Tensor::build(CoreTensor, "C1", Cdims);
+    Tensor C2 = Tensor::build(CoreTensor, "C2", Cdims);
+    initialize_random(C1, C2);
+
+    Dimension Adims = {4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {5, 4};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    if (mode == 0)
+        C1.contract(A, B, {"j"}, {"i"}, {"j", "i"}, alpha, beta);
+    else if (mode == 1)
+        C1("j") = A("i") * B("ji");
+    else if (mode == 2)
+        C1("j") += A("i") * B("ji");
+    else if (mode == 3)
+        C1("j") -= A("i") * B("ji");
+    else
+        throw std::runtime_error("Bad mode.");
+
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Bv = B.data();
+    std::vector<double> &Cv = C2.data();
+
+    for (size_t j = 0; j < Bdims[0]; j++)
+    {
+        Cv[j] = beta * Cv[j];
+        for (size_t i = 0; i < Bdims[1]; i++)
+        {
+            Cv[j] += alpha * Av[i] * Bv[j * Bdims[1] + i];
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_C_equal_A_B(std::string c_ind, std::string a_ind, std::string b_ind,
+                       std::vector<int> c_dim, std::vector<int> a_dim,
+                       std::vector<int> b_dim)
+{
+    std::vector<size_t> dims = {3, 4, 5};
+
+    size_t ni = 3;
+    size_t nj = 4;
+    size_t nk = 5;
+
+    Tensor A = Tensor::build(CoreTensor, "A", {dims[a_dim[0]], dims[a_dim[1]]});
+    initialize_random(A);
+    Tensor B = Tensor::build(CoreTensor, "B", {dims[b_dim[0]], dims[b_dim[1]]});
+    initialize_random(B);
+    Tensor C1 =
+        Tensor::build(CoreTensor, "C1", {dims[c_dim[0]], dims[c_dim[1]]});
+    Tensor C2 =
+        Tensor::build(CoreTensor, "C2", {dims[c_dim[0]], dims[c_dim[1]]});
+    initialize_random(C1, C2);
+
+    std::vector<std::string> c_inds = {std::string(1, c_ind[0]),
+                                       std::string(1, c_ind[1])};
+    std::vector<std::string> a_inds = {std::string(1, a_ind[0]),
+                                       std::string(1, a_ind[1])};
+    std::vector<std::string> b_inds = {std::string(1, b_ind[0]),
+                                       std::string(1, b_ind[1])};
+
+    if (mode == 0)
+        C1.contract(A, B, c_inds, a_inds, b_inds, alpha, beta);
+    else if (mode == 1)
+        C1(c_ind) = A(a_ind) * B(b_ind);
+    else if (mode == 2)
+        C1(c_ind) += A(a_ind) * B(b_ind);
+    else if (mode == 3)
+        C1(c_ind) -= A(a_ind) * B(b_ind);
+    else
+        throw std::runtime_error("Bad mode.");
+
+    C2.scale(beta);
+    std::vector<double> &Av = A.data();
+    std::vector<double> &Bv = B.data();
+    std::vector<double> &Cv = C2.data();
+    std::vector<size_t> n(3);
+    for (n[0] = 0; n[0] < ni; ++n[0])
+    {
+        for (n[1] = 0; n[1] < nj; ++n[1])
+        {
+            for (n[2] = 0; n[2] < nk; ++n[2])
+            {
+                size_t aind1 = n[a_dim[0]];
+                size_t aind2 = n[a_dim[1]];
+                size_t bind1 = n[b_dim[0]];
+                size_t bind2 = n[b_dim[1]];
+                size_t cind1 = n[c_dim[0]];
+                size_t cind2 = n[c_dim[1]];
+                Cv[cind1 * dims[c_dim[1]] + cind2] +=
+                    alpha * Av[aind1 * dims[a_dim[1]] + aind2] *
+                    Bv[bind1 * dims[b_dim[1]] + bind2];
+            }
+        }
+    }
+
+    return relative_difference(C1, C2);
+}
+double try_contract_gemm1()
+{
+    return try_C_equal_A_B("ij", "ik", "jk", {0, 1}, {0, 2}, {1, 2});
+}
+double try_contract_gemm2()
+{
+    return try_C_equal_A_B("ij", "ik", "kj", {0, 1}, {0, 2}, {2, 1});
+}
+double try_contract_gemm3()
+{
+    return try_C_equal_A_B("ij", "ki", "jk", {0, 1}, {2, 0}, {1, 2});
+}
+double try_contract_gemm4()
+{
+    return try_C_equal_A_B("ij", "ki", "kj", {0, 1}, {2, 0}, {2, 1});
+}
+double try_contract_gemm5()
+{
+    return try_C_equal_A_B("ji", "ik", "jk", {1, 0}, {0, 2}, {1, 2});
+}
+double try_contract_gemm6()
+{
+    return try_C_equal_A_B("ji", "ik", "kj", {1, 0}, {0, 2}, {2, 1});
+}
+double try_contract_gemm7()
+{
+    return try_C_equal_A_B("ji", "ki", "jk", {1, 0}, {2, 0}, {1, 2});
+}
+double try_contract_gemm8()
+{
+    return try_C_equal_A_B("ji", "ki", "kj", {1, 0}, {2, 0}, {2, 1});
+}
+double try_contract_label_fail()
+{
+    Dimension Cdims = {3, 4};
+    Tensor C = Tensor::build(CoreTensor, "C", Cdims);
+    initialize_random(C);
+
+    Dimension Adims = {3, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {4, 5};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    C("i") = A("ik") * B("jk");
+
+    return 0.0;
+}
+double try_contract_einsum_fail1()
+{
+    Dimension Cdims = {3};
+    Tensor C = Tensor::build(CoreTensor, "C", Cdims);
+    initialize_random(C);
+
+    Dimension Adims = {3, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {4, 5};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    C("i") = A("ik") * B("jk");
+
+    return 0.0;
+}
+double try_contract_einsum_fail2()
+{
+    Dimension Cdims = {3};
+    Tensor C = Tensor::build(CoreTensor, "C", Cdims);
+    initialize_random(C);
+
+    Dimension Adims = {3, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {4, 5};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    C("ij") = A("ji") * B("jj");
+
+    return 0.0;
+}
+double try_contract_size_fail1()
+{
+    Dimension Cdims = {2, 2, 4};
+    Tensor C = Tensor::build(CoreTensor, "C", Cdims);
+    initialize_random(C);
+
+    Dimension Adims = {2, 3, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {2, 4, 5};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    C("Pij") = A("Pik") * B("Pjk");
+
+    return 0.0;
+}
+double try_contract_size_fail2()
+{
+    Dimension Cdims = {2, 3, 3};
+    Tensor C = Tensor::build(CoreTensor, "C", Cdims);
+    initialize_random(C);
+
+    Dimension Adims = {2, 3, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {2, 4, 5};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    C("Pij") = A("Pik") * B("Pjk");
+
+    return 0.0;
+}
+double try_contract_size_fail3()
+{
+    Dimension Cdims = {2, 3, 4};
+    Tensor C = Tensor::build(CoreTensor, "C", Cdims);
+    initialize_random(C);
+
+    Dimension Adims = {2, 3, 4};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {2, 4, 5};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    C("Pij") = A("Pik") * B("Pjk");
+
+    return 0.0;
+}
+double try_contract_size_fail4()
+{
+    Dimension Cdims = {1, 3, 4};
+    Tensor C = Tensor::build(CoreTensor, "C", Cdims);
+    initialize_random(C);
+
+    Dimension Adims = {2, 3, 5};
+    Tensor A = Tensor::build(CoreTensor, "A", Adims);
+    initialize_random(A);
+
+    Dimension Bdims = {2, 4, 5};
+    Tensor B = Tensor::build(CoreTensor, "B", Bdims);
+    initialize_random(B);
+
+    C("Pij") = A("Pik") * B("Pjk");
+
+    return 0.0;
+}
+double try_get()
+{
+    double error = 0.0;
+    Dimension Cdims = {1, 3, 4};
+    Tensor C = Tensor::build(CoreTensor, "C", Cdims);
+    double index = 0.0;
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                C.at({i, j, k}) = index;
+                index += 1.0;
+            }
+        }
+    }
+    index = 0.0;
+    for (size_t i = 0; i < Cdims[0]; i++)
+    {
+        for (size_t j = 0; j < Cdims[1]; j++)
+        {
+            for (size_t k = 0; k < Cdims[2]; k++)
+            {
+                const double val = C.at({i, j, k});
+                error += std::fabs(val - index);
+                index += 1.0;
+            }
+        }
+    }
+
+    return error;
+}
+
+int main(int argc, char *argv[])
+{
+    printf(ANSI_COLOR_RESET);
+    srand(time(NULL));
+    ambit::initialize(argc, argv);
+
+    bool success;
+
+    printf("==> Simple Operations <==\n\n");
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("%-50s %-9s %-9s %11s\n", "Description", "Expected", "Observed",
+           "Delta");
+    printf("%s\n", std::string(82, '-').c_str());
+    success = true;
+    success &=
+        test_function(try_relative_difference, "Relative Difference", kExact);
+    success &= test_function(try_1_norm, "1-Norm", kEpsilon);
+    success &= test_function(try_2_norm, "2-Norm", kEpsilon);
+    success &= test_function(try_inf_norm, "Inf-Norm", kEpsilon);
+    success &= test_function(try_zero, "Zero", kExact);
+    success &= test_function(try_copy, "Copy", kExact);
+    success &= test_function(try_scale, "Scale", kExact);
+    success &= test_function(try_get, "Get/Set", kExact);
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Tests: %s\n\n", success ? "All passed" : "Some failed");
+
+    printf("==> Slice Operations <==\n\n");
+    success = true;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("%-50s %-9s %-9s %11s\n", "Description", "Expected", "Observed",
+           "Delta");
+    mode = 0;
+    alpha = 1.0;
+    beta = 0.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
+    printf("%s\n", std::string(82, '-').c_str());
+    success &=
+        test_function(try_slice_rank0_same1, "Slice Rank-0 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank1_same1, "Slice Rank-1 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank1_same2, "Slice Rank-1 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank1_diff1, "Slice Rank-1 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank1_diff2, "Slice Rank-1 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank2_same1, "Slice Rank-2 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank2_same2, "Slice Rank-2 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank2_same3, "Slice Rank-2 Same 3", kExact);
+    success &=
+        test_function(try_slice_rank2_diff1, "Slice Rank-2 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank2_diff2, "Slice Rank-2 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank2_diff3, "Slice Rank-2 Diff 3", kExact);
+    success &=
+        test_function(try_slice_rank3_same1, "Slice Rank-3 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank3_same2, "Slice Rank-3 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank3_same3, "Slice Rank-3 Same 3", kExact);
+    success &=
+        test_function(try_slice_rank3_same4, "Slice Rank-3 Same 4", kExact);
+    success &=
+        test_function(try_slice_rank3_diff1, "Slice Rank-3 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank3_diff2, "Slice Rank-3 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank3_diff3, "Slice Rank-3 Diff 3", kExact);
+    success &=
+        test_function(try_slice_rank3_diff4, "Slice Rank-3 Diff 4", kExact);
+    mode = 0;
+    alpha = random_double();
+    beta = random_double();
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
+    printf("%s\n", std::string(82, '-').c_str());
+    success &=
+        test_function(try_slice_rank0_same1, "Slice Rank-0 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank1_same1, "Slice Rank-1 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank1_same2, "Slice Rank-1 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank1_diff1, "Slice Rank-1 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank1_diff2, "Slice Rank-1 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank2_same1, "Slice Rank-2 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank2_same2, "Slice Rank-2 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank2_same3, "Slice Rank-2 Same 3", kExact);
+    success &=
+        test_function(try_slice_rank2_diff1, "Slice Rank-2 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank2_diff2, "Slice Rank-2 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank2_diff3, "Slice Rank-2 Diff 3", kExact);
+    success &=
+        test_function(try_slice_rank3_same1, "Slice Rank-3 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank3_same2, "Slice Rank-3 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank3_same3, "Slice Rank-3 Same 3", kExact);
+    success &=
+        test_function(try_slice_rank3_same4, "Slice Rank-3 Same 4", kExact);
+    success &=
+        test_function(try_slice_rank3_diff1, "Slice Rank-3 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank3_diff2, "Slice Rank-3 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank3_diff3, "Slice Rank-3 Diff 3", kExact);
+    success &=
+        test_function(try_slice_rank3_diff4, "Slice Rank-3 Diff 4", kExact);
+    mode = 1;
+    alpha = 1.0;
+    beta = 0.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Operator Overloading: =\n");
+    printf("%s\n", std::string(82, '-').c_str());
+    success &=
+        test_function(try_slice_rank0_same1, "Slice Rank-0 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank1_same1, "Slice Rank-1 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank1_same2, "Slice Rank-1 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank1_diff1, "Slice Rank-1 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank1_diff2, "Slice Rank-1 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank2_same1, "Slice Rank-2 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank2_same2, "Slice Rank-2 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank2_same3, "Slice Rank-2 Same 3", kExact);
+    success &=
+        test_function(try_slice_rank2_diff1, "Slice Rank-2 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank2_diff2, "Slice Rank-2 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank2_diff3, "Slice Rank-2 Diff 3", kExact);
+    success &=
+        test_function(try_slice_rank3_same1, "Slice Rank-3 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank3_same2, "Slice Rank-3 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank3_same3, "Slice Rank-3 Same 3", kExact);
+    success &=
+        test_function(try_slice_rank3_same4, "Slice Rank-3 Same 4", kExact);
+    success &=
+        test_function(try_slice_rank3_diff1, "Slice Rank-3 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank3_diff2, "Slice Rank-3 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank3_diff3, "Slice Rank-3 Diff 3", kExact);
+    success &=
+        test_function(try_slice_rank3_diff4, "Slice Rank-3 Diff 4", kExact);
+    mode = 2;
+    alpha = 1.0;
+    beta = 1.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Operator Overloading: +=\n");
+    printf("%s\n", std::string(82, '-').c_str());
+    success &=
+        test_function(try_slice_rank0_same1, "Slice Rank-0 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank1_same1, "Slice Rank-1 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank1_same2, "Slice Rank-1 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank1_diff1, "Slice Rank-1 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank1_diff2, "Slice Rank-1 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank2_same1, "Slice Rank-2 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank2_same2, "Slice Rank-2 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank2_same3, "Slice Rank-2 Same 3", kExact);
+    success &=
+        test_function(try_slice_rank2_diff1, "Slice Rank-2 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank2_diff2, "Slice Rank-2 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank2_diff3, "Slice Rank-2 Diff 3", kExact);
+    success &=
+        test_function(try_slice_rank3_same1, "Slice Rank-3 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank3_same2, "Slice Rank-3 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank3_same3, "Slice Rank-3 Same 3", kExact);
+    success &=
+        test_function(try_slice_rank3_same4, "Slice Rank-3 Same 4", kExact);
+    success &=
+        test_function(try_slice_rank3_diff1, "Slice Rank-3 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank3_diff2, "Slice Rank-3 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank3_diff3, "Slice Rank-3 Diff 3", kExact);
+    success &=
+        test_function(try_slice_rank3_diff4, "Slice Rank-3 Diff 4", kExact);
+    mode = 3;
+    alpha = -1.0;
+    beta = 1.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Operator Overloading: -=\n");
+    printf("%s\n", std::string(82, '-').c_str());
+    success &=
+        test_function(try_slice_rank0_same1, "Slice Rank-0 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank1_same1, "Slice Rank-1 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank1_same2, "Slice Rank-1 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank1_diff1, "Slice Rank-1 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank1_diff2, "Slice Rank-1 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank2_same1, "Slice Rank-2 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank2_same2, "Slice Rank-2 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank2_same3, "Slice Rank-2 Same 3", kExact);
+    success &=
+        test_function(try_slice_rank2_diff1, "Slice Rank-2 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank2_diff2, "Slice Rank-2 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank2_diff3, "Slice Rank-2 Diff 3", kExact);
+    success &=
+        test_function(try_slice_rank3_same1, "Slice Rank-3 Same 1", kExact);
+    success &=
+        test_function(try_slice_rank3_same2, "Slice Rank-3 Same 2", kExact);
+    success &=
+        test_function(try_slice_rank3_same3, "Slice Rank-3 Same 3", kExact);
+    success &=
+        test_function(try_slice_rank3_same4, "Slice Rank-3 Same 4", kExact);
+    success &=
+        test_function(try_slice_rank3_diff1, "Slice Rank-3 Diff 1", kExact);
+    success &=
+        test_function(try_slice_rank3_diff2, "Slice Rank-3 Diff 2", kExact);
+    success &=
+        test_function(try_slice_rank3_diff3, "Slice Rank-3 Diff 3", kExact);
+    success &=
+        test_function(try_slice_rank3_diff4, "Slice Rank-3 Diff 4", kExact);
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Tests: %s\n\n", success ? "All Passed" : "Some Failed");
+
+    printf("==> Slice Exceptions <==\n\n");
+    success = true;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("%-50s %-9s %-9s %11s\n", "Description", "Expected", "Observed",
+           "Delta");
+    mode = 0;
+    alpha = 1.0;
+    beta = 0.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
+    printf("%s\n", std::string(82, '-').c_str());
+    success &=
+        test_function(try_slice_label_fail, "Slice Label Fail", kException);
+    success &=
+        test_function(try_slice_rank_fail, "Slice Rank Fail", kException);
+    success &=
+        test_function(try_slice_size_fail, "Slice Size Fail", kException);
+    success &=
+        test_function(try_slice_bounds_fail, "Slice Bounds Fail", kException);
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Tests: %s\n\n", success ? "All Passed" : "Some Failed");
+
+    printf("==> Permute Operations <==\n\n");
+    success = true;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("%-50s %-9s %-9s %11s\n", "Description", "Expected", "Observed",
+           "Delta");
+    mode = 0;
+    alpha = 1.0;
+    beta = 0.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
+    printf("%s\n", std::string(82, '-').c_str());
+    success &= test_function(try_permute_rank0, "Permute Rank-0", kExact);
+    success &= test_function(try_permute_rank1_i, "Permute Rank-1 i", kExact);
+    success &= test_function(try_permute_rank2_ij, "Permute Rank-2 ij", kExact);
+    success &= test_function(try_permute_rank2_ji, "Permute Rank-2 ji", kExact);
+    success &=
+        test_function(try_permute_rank3_ijk, "Permute Rank-3 ijk", kExact);
+    success &=
+        test_function(try_permute_rank3_ikj, "Permute Rank-3 ikj", kExact);
+    success &=
+        test_function(try_permute_rank3_jik, "Permute Rank-3 jik", kExact);
+    success &=
+        test_function(try_permute_rank3_jki, "Permute Rank-3 jki", kExact);
+    success &=
+        test_function(try_permute_rank3_kij, "Permute Rank-3 kij", kExact);
+    success &=
+        test_function(try_permute_rank3_kji, "Permute Rank-3 kji", kExact);
+    success &=
+        test_function(try_permute_rank4_ijkl, "Permute Rank-4 ijkl", kExact);
+    success &=
+        test_function(try_permute_rank4_lkji, "Permute Rank-4 lkji", kExact);
+    success &=
+        test_function(try_permute_rank4_ijlk, "Permute Rank-4 ijlk", kExact);
+    success &=
+        test_function(try_permute_rank4_jikl, "Permute Rank-4 jikl", kExact);
+    success &=
+        test_function(try_permute_rank4_ikjl, "Permute Rank-4 ikjl", kExact);
+    success &=
+        test_function(try_permute_rank4_lkji, "Permute Rank-4 lkji", kExact);
+    mode = 0;
+    alpha = random_double();
+    beta = random_double();
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
+    printf("%s\n", std::string(82, '-').c_str());
+    success &= test_function(try_permute_rank0, "Permute Rank-0", kExact);
+    success &= test_function(try_permute_rank1_i, "Permute Rank-1 i", kExact);
+    success &= test_function(try_permute_rank2_ij, "Permute Rank-2 ij", kExact);
+    success &= test_function(try_permute_rank2_ji, "Permute Rank-2 ji", kExact);
+    success &=
+        test_function(try_permute_rank3_ijk, "Permute Rank-3 ijk", kExact);
+    success &=
+        test_function(try_permute_rank3_ikj, "Permute Rank-3 ikj", kExact);
+    success &=
+        test_function(try_permute_rank3_jik, "Permute Rank-3 jik", kExact);
+    success &=
+        test_function(try_permute_rank3_jki, "Permute Rank-3 jki", kExact);
+    success &=
+        test_function(try_permute_rank3_kij, "Permute Rank-3 kij", kExact);
+    success &=
+        test_function(try_permute_rank3_kji, "Permute Rank-3 kji", kExact);
+    success &=
+        test_function(try_permute_rank4_ijkl, "Permute Rank-4 ijkl", kExact);
+    success &=
+        test_function(try_permute_rank4_ijlk, "Permute Rank-4 ijlk", kExact);
+    success &=
+        test_function(try_permute_rank4_jikl, "Permute Rank-4 jikl", kExact);
+    success &=
+        test_function(try_permute_rank4_ikjl, "Permute Rank-4 ikjl", kExact);
+    success &=
+        test_function(try_permute_rank4_lkji, "Permute Rank-4 lkji", kExact);
+    mode = 1;
+    alpha = 1.0;
+    beta = 0.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Operator Overloading: =\n");
+    printf("%s\n", std::string(82, '-').c_str());
+    success &= test_function(try_permute_rank0, "Permute Rank-0", kExact);
+    success &= test_function(try_permute_rank1_i, "Permute Rank-1 i", kExact);
+    success &= test_function(try_permute_rank2_ij, "Permute Rank-2 ij", kExact);
+    success &= test_function(try_permute_rank2_ji, "Permute Rank-2 ji", kExact);
+    success &=
+        test_function(try_permute_rank3_ijk, "Permute Rank-3 ijk", kExact);
+    success &=
+        test_function(try_permute_rank3_ikj, "Permute Rank-3 ikj", kExact);
+    success &=
+        test_function(try_permute_rank3_jik, "Permute Rank-3 jik", kExact);
+    success &=
+        test_function(try_permute_rank3_jki, "Permute Rank-3 jki", kExact);
+    success &=
+        test_function(try_permute_rank3_kij, "Permute Rank-3 kij", kExact);
+    success &=
+        test_function(try_permute_rank3_kji, "Permute Rank-3 kji", kExact);
+    success &=
+        test_function(try_permute_rank4_ijkl, "Permute Rank-4 ijkl", kExact);
+    success &=
+        test_function(try_permute_rank4_ijlk, "Permute Rank-4 ijlk", kExact);
+    success &=
+        test_function(try_permute_rank4_jikl, "Permute Rank-4 jikl", kExact);
+    success &=
+        test_function(try_permute_rank4_ikjl, "Permute Rank-4 ikjl", kExact);
+    success &=
+        test_function(try_permute_rank4_lkji, "Permute Rank-4 lkji", kExact);
+    mode = 2;
+    alpha = 1.0;
+    beta = 1.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Operator Overloading: +=\n");
+    printf("%s\n", std::string(82, '-').c_str());
+    success &= test_function(try_permute_rank0, "Permute Rank-0", kExact);
+    success &= test_function(try_permute_rank1_i, "Permute Rank-1 i", kExact);
+    success &= test_function(try_permute_rank2_ij, "Permute Rank-2 ij", kExact);
+    success &= test_function(try_permute_rank2_ji, "Permute Rank-2 ji", kExact);
+    success &=
+        test_function(try_permute_rank3_ijk, "Permute Rank-3 ijk", kExact);
+    success &=
+        test_function(try_permute_rank3_ikj, "Permute Rank-3 ikj", kExact);
+    success &=
+        test_function(try_permute_rank3_jik, "Permute Rank-3 jik", kExact);
+    success &=
+        test_function(try_permute_rank3_jki, "Permute Rank-3 jki", kExact);
+    success &=
+        test_function(try_permute_rank3_kij, "Permute Rank-3 kij", kExact);
+    success &=
+        test_function(try_permute_rank3_kji, "Permute Rank-3 kji", kExact);
+    success &=
+        test_function(try_permute_rank4_ijkl, "Permute Rank-4 ijkl", kExact);
+    success &=
+        test_function(try_permute_rank4_ijlk, "Permute Rank-4 ijlk", kExact);
+    success &=
+        test_function(try_permute_rank4_jikl, "Permute Rank-4 jikl", kExact);
+    success &=
+        test_function(try_permute_rank4_ikjl, "Permute Rank-4 ikjl", kExact);
+    success &=
+        test_function(try_permute_rank4_lkji, "Permute Rank-4 lkji", kExact);
+    mode = 3;
+    alpha = -1.0;
+    beta = 1.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Operator Overloading: -=\n");
+    printf("%s\n", std::string(82, '-').c_str());
+    success &= test_function(try_permute_rank0, "Permute Rank-0", kExact);
+    success &= test_function(try_permute_rank1_i, "Permute Rank-1 i", kExact);
+    success &= test_function(try_permute_rank2_ij, "Permute Rank-2 ij", kExact);
+    success &= test_function(try_permute_rank2_ji, "Permute Rank-2 ji", kExact);
+    success &=
+        test_function(try_permute_rank3_ijk, "Permute Rank-3 ijk", kExact);
+    success &=
+        test_function(try_permute_rank3_ikj, "Permute Rank-3 ikj", kExact);
+    success &=
+        test_function(try_permute_rank3_jik, "Permute Rank-3 jik", kExact);
+    success &=
+        test_function(try_permute_rank3_jki, "Permute Rank-3 jki", kExact);
+    success &=
+        test_function(try_permute_rank3_kij, "Permute Rank-3 kij", kExact);
+    success &=
+        test_function(try_permute_rank3_kji, "Permute Rank-3 kji", kExact);
+    success &=
+        test_function(try_permute_rank4_ijkl, "Permute Rank-4 ijkl", kExact);
+    success &=
+        test_function(try_permute_rank4_ijlk, "Permute Rank-4 ijlk", kExact);
+    success &=
+        test_function(try_permute_rank4_jikl, "Permute Rank-4 jikl", kExact);
+    success &=
+        test_function(try_permute_rank4_ikjl, "Permute Rank-4 ikjl", kExact);
+    success &=
+        test_function(try_permute_rank4_lkji, "Permute Rank-4 lkji", kExact);
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Tests: %s\n\n", success ? "All Passed" : "Some Failed");
+
+    printf("==> Permute Exceptions <==\n\n");
+    success = true;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("%-50s %-9s %-9s %11s\n", "Description", "Expected", "Observed",
+           "Delta");
+    mode = 0;
+    alpha = 1.0;
+    beta = 0.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
+    printf("%s\n", std::string(82, '-').c_str());
+    success &=
+        test_function(try_permute_label_fail, "Permute Label Fail", kException);
+    success &=
+        test_function(try_permute_rank_fail, "Permute Rank Fail", kException);
+    success &=
+        test_function(try_permute_index_fail, "Permute Index Fail", kException);
+    success &=
+        test_function(try_permute_size_fail, "Permute Size Fail", kException);
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Tests: %s\n\n", success ? "All Passed" : "Some Failed");
+
+    printf("==> Contract Operations <==\n\n");
+    success = true;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("%-50s %-9s %-9s %11s\n", "Description", "Expected", "Observed",
+           "Delta");
+    mode = 0;
+    alpha = 1.0;
+    beta = 0.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
+    printf("%s\n", std::string(82, '-').c_str());
+    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
+    success &=
+        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
+    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
+//    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
+//    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
+//    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
+//    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
+//    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
+//    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
+//    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
+//    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
+//    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
+    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
+//    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
+//    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
+//    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
+//    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
+//    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
+//    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
+//    mode = 0;
+//    alpha = random_double();
+//    beta = random_double();
+//    printf("%s\n", std::string(82, '-').c_str());
+//    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
+//    printf("%s\n", std::string(82, '-').c_str());
+//    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
+//    success &=
+//        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
+//    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
+//    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
+//    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
+//    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
+//    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
+//    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
+//    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
+//    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
+//    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
+//    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
+//    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
+//    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
+//    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
+//    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
+//    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
+//    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
+//    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
+//    mode = 1;
+//    alpha = 1.0;
+//    beta = 0.0;
+//    printf("%s\n", std::string(82, '-').c_str());
+//    printf("Operator Overloading: =\n");
+//    printf("%s\n", std::string(82, '-').c_str());
+//    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
+//    success &=
+//        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
+//    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
+//    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
+//    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
+//    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
+//    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
+//    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
+//    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
+//    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
+//    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
+//    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
+//    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
+//    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
+//    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
+//    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
+//    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
+//    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
+//    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
+//    mode = 2;
+//    alpha = 1.0;
+//    beta = 1.0;
+//    printf("%s\n", std::string(82, '-').c_str());
+//    printf("Operator Overloading: +=\n");
+//    printf("%s\n", std::string(82, '-').c_str());
+//    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
+//    success &=
+//        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
+//    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
+//    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
+//    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
+//    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
+//    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
+//    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
+//    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
+//    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
+//    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
+//    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
+//    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
+//    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
+//    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
+//    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
+//    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
+//    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
+//    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
+//    mode = 3;
+//    alpha = -1.0;
+//    beta = 1.0;
+//    printf("%s\n", std::string(82, '-').c_str());
+//    printf("Operator Overloading: -=\n");
+//    printf("%s\n", std::string(82, '-').c_str());
+//    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
+//    success &=
+//        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
+//    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
+//    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
+//    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
+//    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
+//    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
+//    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
+//    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
+//    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
+//    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
+//    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
+//    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
+//    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
+//    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
+//    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
+//    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
+//    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
+//    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
+//    printf("%s\n", std::string(82, '-').c_str());
+//    printf("Tests: %s\n\n", success ? "All Passed" : "Some Failed");
+
+//    printf("==> Contract Exceptions <==\n\n");
+//    success = true;
+//    printf("%s\n", std::string(82, '-').c_str());
+//    printf("%-50s %-9s %-9s %11s\n", "Description", "Expected", "Observed",
+//           "Delta");
+//    mode = 0;
+//    alpha = 1.0;
+//    beta = 0.0;
+//    printf("%s\n", std::string(82, '-').c_str());
+//    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
+//    printf("%s\n", std::string(82, '-').c_str());
+//    success &= test_function(try_contract_label_fail, "Contract Label Fail",
+//                             kException);
+//    success &= test_function(try_contract_einsum_fail1,
+//                             "Contract Einsum Fail 1", kException);
+//    success &= test_function(try_contract_einsum_fail2,
+//                             "Contract Einsum Fail 2", kException);
+//    success &= test_function(try_contract_size_fail1, "Contract Size Fail 1",
+//                             kException);
+//    success &= test_function(try_contract_size_fail2, "Contract Size Fail 2",
+//                             kException);
+//    success &= test_function(try_contract_size_fail3, "Contract Size Fail 3",
+//                             kException);
+//    success &= test_function(try_contract_size_fail4, "Contract Size Fail 4",
+//                             kException);
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Tests: %s\n\n", success ? "All Passed" : "Some Failed");
+
+    ambit::finalize();
+
+    return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+//#include <iostream>
+//#include <vector>
+
+// using namespace std;
+
+//#include <tblis/tblis.h>
+
+// using std::cout;
+// using std::endl;
+// using std::vector;
+
+// int main(int argc, char **argv)
+//{
+
+//    int d1 = 4, d2 = 4, d3 = 4, d4 = 4;
+//    vector<tblis::label_type> la = {1, 2};
+//    vector<tblis::label_type> lb = {2, 3};
+//    vector<tblis::label_type> lc = {1, 3};
+//    tblis::tensor<double> A({d1, d2});
+//    tblis::tensor<double> B({d2, d3});
+//    tblis::tensor<double> C({d1, d4});
+
+////    std::vector<double> C_vec(16, 0.0);
+////    MArray::varray_view<double> D({4, 4}, &C_vec.data()[0]);
+////    //  std::vector<long> d5{2,3,2,3};
+//    //  tblis::tensor<double> C({d1,d2});
+
+//    A = 1.0;
+//    B = 0.2;
+//    std::cout << A;
+//    std::cout << B;
+
+//    double beta = 0.0;
+//    double alpha = 1.0;
+//    //  tblis::mult<double>(alpha, A, la.data(), B, lb.data(), beta, C,
+//    //  lc.data());
+//    //  tblis::mult<double>(alpha, A, la.data(), B, lb.data(), beta, C,
+//    //  lc.data());
+//    tblis::mult<double>(alpha, A, "a1,b2", B, "b2,c", beta, C, "a1,c");
+////    for (auto c : C_vec)
+////        std::cout << " " << c;
+//}

From 65af943cdf61ca97fa302f800844f7636d150482 Mon Sep 17 00:00:00 2001
From: Francesco Evangelista <francesco.evangelista@gmail.com>
Date: Thu, 25 Jun 2020 00:30:58 -0400
Subject: [PATCH 02/10] Implement and test gemm

---
 src/tensor/core/core.cc |  30 +++++---
 test/test_tblis.cc      | 158 ++++++++++++++++++++--------------------
 2 files changed, 100 insertions(+), 88 deletions(-)

diff --git a/src/tensor/core/core.cc b/src/tensor/core/core.cc
index 5cd2f9e..30793a8 100644
--- a/src/tensor/core/core.cc
+++ b/src/tensor/core/core.cc
@@ -172,13 +172,25 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
     size_t Bsize = Binds.size();
     size_t Csize = Cinds.size();
 
-    std::string indices_A = indices::to_string(Ainds, ",");
-    std::string indices_B = indices::to_string(Binds, ",");
-    std::string indices_C = indices::to_string(Cinds, ",");
-
-    std::cout << "\n indices_A " << indices_A << std::endl;
-    std::cout << "\n indices_B " << indices_B << std::endl;
-    std::cout << "\n indices_C " << indices_C << std::endl;
+    std::string indices_A = indices::to_string(Ainds, "");
+    std::string indices_B = indices::to_string(Binds, "");
+    std::string indices_C = indices::to_string(Cinds, "");
+
+    std::cout << "\n indices_A " << indices_A;
+    auto Adims = A->dims();
+    for (auto d : Adims)
+        std::cout << " " << d;
+    std::cout << std::endl;
+    std::cout << "\n indices_B " << indices_B;
+    auto Bdims = B->dims();
+    for (auto d : Bdims)
+        std::cout << " " << d;
+    std::cout << std::endl;
+    std::cout << "\n indices_C " << indices_C;
+    auto Cdims = this->dims();
+    for (auto d : Cdims)
+        std::cout << " " << d;
+    std::cout << std::endl;
 
     // Scalar multiplication
     if ((Asize == 0) and (Bsize == 0) and (Csize == 0))
@@ -196,7 +208,7 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
                                               indices_B.c_str()) +
                    beta * data_[0];
     }
-    // Index Type ABB -> multp operation
+    // Index Type ABC -> multp operation
     else if ((Ainds.size() > 0) and (Binds.size() > 0) and (Cinds.size() > 0))
     {
         std::cout << "\n Calling mult " << std::endl;
@@ -210,7 +222,7 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
         MArray::varray_view<double> B_v(Bp->dims(), &(Bp->data()[0]));
         std::cout << "\n Get C_v " << std::endl;
         MArray::varray_view<double> C_v(this->dims(), &(this->data()[0]));
-        std::cout << "\n CAll mult " << std::endl;
+        std::cout << "\n Call mult " << std::endl;
         tblis::mult<double>(alpha, A_v, indices_A.c_str(), B_v,
                             indices_B.c_str(), beta, C_v, indices_C.c_str());
     }
diff --git a/test/test_tblis.cc b/test/test_tblis.cc
index 628fa96..83d71e2 100644
--- a/test/test_tblis.cc
+++ b/test/test_tblis.cc
@@ -2962,24 +2962,24 @@ int main(int argc, char *argv[])
 //    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
 //    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
 //    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
-//    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
+    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
     success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
-//    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
-//    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
-//    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
-//    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
-//    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
-//    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
-//    mode = 0;
-//    alpha = random_double();
-//    beta = random_double();
-//    printf("%s\n", std::string(82, '-').c_str());
-//    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
-//    printf("%s\n", std::string(82, '-').c_str());
-//    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
-//    success &=
-//        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
-//    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
+    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
+    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
+    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
+    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
+    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
+    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
+    mode = 0;
+    alpha = random_double();
+    beta = random_double();
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
+    printf("%s\n", std::string(82, '-').c_str());
+    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
+    success &=
+        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
+    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
 //    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
 //    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
 //    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
@@ -2988,24 +2988,24 @@ int main(int argc, char *argv[])
 //    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
 //    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
 //    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
-//    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
-//    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
-//    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
-//    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
-//    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
-//    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
-//    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
-//    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
-//    mode = 1;
-//    alpha = 1.0;
-//    beta = 0.0;
-//    printf("%s\n", std::string(82, '-').c_str());
-//    printf("Operator Overloading: =\n");
-//    printf("%s\n", std::string(82, '-').c_str());
-//    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
-//    success &=
-//        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
-//    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
+    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
+    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
+    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
+    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
+    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
+    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
+    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
+    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
+    mode = 1;
+    alpha = 1.0;
+    beta = 0.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Operator Overloading: =\n");
+    printf("%s\n", std::string(82, '-').c_str());
+    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
+    success &=
+        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
+    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
 //    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
 //    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
 //    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
@@ -3014,24 +3014,24 @@ int main(int argc, char *argv[])
 //    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
 //    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
 //    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
-//    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
-//    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
-//    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
-//    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
-//    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
-//    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
-//    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
-//    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
-//    mode = 2;
-//    alpha = 1.0;
-//    beta = 1.0;
-//    printf("%s\n", std::string(82, '-').c_str());
-//    printf("Operator Overloading: +=\n");
-//    printf("%s\n", std::string(82, '-').c_str());
-//    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
-//    success &=
-//        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
-//    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
+    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
+    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
+    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
+    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
+    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
+    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
+    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
+    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
+    mode = 2;
+    alpha = 1.0;
+    beta = 1.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Operator Overloading: +=\n");
+    printf("%s\n", std::string(82, '-').c_str());
+    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
+    success &=
+        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
+    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
 //    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
 //    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
 //    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
@@ -3040,24 +3040,24 @@ int main(int argc, char *argv[])
 //    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
 //    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
 //    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
-//    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
-//    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
-//    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
-//    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
-//    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
-//    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
-//    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
-//    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
-//    mode = 3;
-//    alpha = -1.0;
-//    beta = 1.0;
-//    printf("%s\n", std::string(82, '-').c_str());
-//    printf("Operator Overloading: -=\n");
-//    printf("%s\n", std::string(82, '-').c_str());
-//    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
-//    success &=
-//        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
-//    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
+    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
+    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
+    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
+    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
+    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
+    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
+    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
+    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
+    mode = 3;
+    alpha = -1.0;
+    beta = 1.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Operator Overloading: -=\n");
+    printf("%s\n", std::string(82, '-').c_str());
+    success &= test_function(try_contract_scalar, "Contract scalar", kEpsilon);
+    success &=
+        test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
+    success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
 //    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
 //    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
 //    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
@@ -3066,14 +3066,14 @@ int main(int argc, char *argv[])
 //    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
 //    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
 //    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
-//    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
-//    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
-//    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
-//    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
-//    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
-//    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
-//    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
-//    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
+    success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
+    success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
+    success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
+    success &= test_function(try_contract_gemm4, "Contract gemm 4", kEpsilon);
+    success &= test_function(try_contract_gemm5, "Contract gemm 5", kEpsilon);
+    success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
+    success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
+    success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
 //    printf("%s\n", std::string(82, '-').c_str());
 //    printf("Tests: %s\n\n", success ? "All Passed" : "Some Failed");
 

From 843d913329097f7cfde4dd18b3760129396329b6 Mon Sep 17 00:00:00 2001
From: Francesco Evangelista <francesco.evangelista@gmail.com>
Date: Thu, 25 Jun 2020 01:05:02 -0400
Subject: [PATCH 03/10] Most tests pass

---
 src/tensor/core/core.cc |  39 +++++++++++---
 test/test_tblis.cc      | 112 ++++++++++++++++++++--------------------
 2 files changed, 87 insertions(+), 64 deletions(-)

diff --git a/src/tensor/core/core.cc b/src/tensor/core/core.cc
index 30793a8..3957ed0 100644
--- a/src/tensor/core/core.cc
+++ b/src/tensor/core/core.cc
@@ -166,8 +166,8 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
     shared_ptr<TensorImpl> A2;
     shared_ptr<TensorImpl> B2;
     shared_ptr<TensorImpl> C2;
+#define TBLIS_DEBUG 0
 #ifdef HAVE_TBLIS
-    std::cout << "Calling TBLIS" << std::endl;
     size_t Asize = Ainds.size();
     size_t Bsize = Binds.size();
     size_t Csize = Cinds.size();
@@ -176,6 +176,8 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
     std::string indices_B = indices::to_string(Binds, "");
     std::string indices_C = indices::to_string(Cinds, "");
 
+#if TBLIS_DEBUG
+    std::cout << "Calling TBLIS" << std::endl;
     std::cout << "\n indices_A " << indices_A;
     auto Adims = A->dims();
     for (auto d : Adims)
@@ -191,14 +193,33 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
     for (auto d : Cdims)
         std::cout << " " << d;
     std::cout << std::endl;
+#endif
 
     // Scalar multiplication
     if ((Asize == 0) and (Bsize == 0) and (Csize == 0))
     {
         data_[0] = alpha * A->data()[0] * B->data()[0] + beta * data_[0];
     }
+    // Index Type A -> add operation
+    else if ((Asize == Csize) and (Bsize == 0))
+    {
+        TensorImplPtr Ap = const_cast<TensorImplPtr>(A);
+        MArray::varray_view<double> A_v(Ap->dims(), &(Ap->data()[0]));
+        MArray::varray_view<double> C_v(this->dims(), &(this->data()[0]));
+        tblis::add<double>(alpha * B->data()[0], A_v, indices_A.c_str(), beta,
+                           C_v, indices_C.c_str());
+    }
+    // Index Type B -> add operation
+    else if ((Asize == 0) and (Bsize == Csize))
+    {
+        TensorImplPtr Bp = const_cast<TensorImplPtr>(B);
+        MArray::varray_view<double> B_v(Bp->dims(), &(Bp->data()[0]));
+        MArray::varray_view<double> C_v(this->dims(), &(this->data()[0]));
+        tblis::add<double>(alpha * A->data()[0], B_v, indices_B.c_str(), beta,
+                           C_v, indices_C.c_str());
+    }
     // Index Type AB -> dot operation
-    else if ((Cinds.size() == 0) and (Ainds.size() > 0) and (Binds.size() > 0))
+    else if ((Asize > 0) and (Bsize > 0) and (Csize == 0))
     {
         TensorImplPtr Ap = const_cast<TensorImplPtr>(A);
         TensorImplPtr Bp = const_cast<TensorImplPtr>(B);
@@ -209,20 +230,22 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
                    beta * data_[0];
     }
     // Index Type ABC -> multp operation
-    else if ((Ainds.size() > 0) and (Binds.size() > 0) and (Cinds.size() > 0))
+    else if ((Asize > 0) and (Bsize > 0) and (Csize > 0))
     {
+#if TBLIS_DEBUG
         std::cout << "\n Calling mult " << std::endl;
         std::cout << "\n Get Ap " << std::endl;
-        TensorImplPtr Ap = const_cast<TensorImplPtr>(A);
         std::cout << "\n Get Bp " << std::endl;
-        TensorImplPtr Bp = const_cast<TensorImplPtr>(B);
         std::cout << "\n Get A_v " << std::endl;
-        MArray::varray_view<double> A_v(Ap->dims(), &(Ap->data()[0]));
         std::cout << "\n Get B_v " << std::endl;
-        MArray::varray_view<double> B_v(Bp->dims(), &(Bp->data()[0]));
         std::cout << "\n Get C_v " << std::endl;
-        MArray::varray_view<double> C_v(this->dims(), &(this->data()[0]));
         std::cout << "\n Call mult " << std::endl;
+#endif
+        TensorImplPtr Ap = const_cast<TensorImplPtr>(A);
+        TensorImplPtr Bp = const_cast<TensorImplPtr>(B);
+        MArray::varray_view<double> A_v(Ap->dims(), &(Ap->data()[0]));
+        MArray::varray_view<double> B_v(Bp->dims(), &(Bp->data()[0]));
+        MArray::varray_view<double> C_v(this->dims(), &(this->data()[0]));
         tblis::mult<double>(alpha, A_v, indices_A.c_str(), B_v,
                             indices_B.c_str(), beta, C_v, indices_C.c_str());
     }
diff --git a/test/test_tblis.cc b/test/test_tblis.cc
index 83d71e2..c364ca1 100644
--- a/test/test_tblis.cc
+++ b/test/test_tblis.cc
@@ -2954,14 +2954,14 @@ int main(int argc, char *argv[])
     success &=
         test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
     success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
-//    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
-//    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
-//    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
-//    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
-//    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
-//    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
-//    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
-//    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
+    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
+    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
+    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
+    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
+    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
+    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
+    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
+    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
     success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
     success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
     success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
@@ -2980,14 +2980,14 @@ int main(int argc, char *argv[])
     success &=
         test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
     success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
-//    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
-//    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
-//    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
-//    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
-//    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
-//    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
-//    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
-//    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
+    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
+    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
+    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
+    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
+    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
+    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
+    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
+    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
     success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
     success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
     success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
@@ -3006,14 +3006,14 @@ int main(int argc, char *argv[])
     success &=
         test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
     success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
-//    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
-//    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
-//    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
-//    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
-//    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
-//    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
-//    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
-//    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
+    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
+    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
+    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
+    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
+    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
+    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
+    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
+    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
     success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
     success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
     success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
@@ -3032,14 +3032,14 @@ int main(int argc, char *argv[])
     success &=
         test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
     success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
-//    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
-//    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
-//    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
-//    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
-//    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
-//    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
-//    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
-//    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
+    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
+    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
+    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
+    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
+    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
+    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
+    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
+    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
     success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
     success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
     success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
@@ -3058,14 +3058,14 @@ int main(int argc, char *argv[])
     success &=
         test_function(try_contract_hadamard, "Contract hadamard", kEpsilon);
     success &= test_function(try_contract_dot, "Contract dot", kEpsilon);
-//    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
-//    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
-//    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
-//    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
-//    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
-//    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
-//    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
-//    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
+    success &= test_function(try_contract_axpy1, "Contract axpy 1", kEpsilon);
+    success &= test_function(try_contract_axpy2, "Contract axpy 2", kEpsilon);
+    success &= test_function(try_contract_ger1, "Contract ger 1", kEpsilon);
+    success &= test_function(try_contract_ger2, "Contract ger 2", kEpsilon);
+    success &= test_function(try_contract_gemv1, "Contract gemv 1", kEpsilon);
+    success &= test_function(try_contract_gemv2, "Contract gemv 2", kEpsilon);
+    success &= test_function(try_contract_gemv3, "Contract gemv 3", kEpsilon);
+    success &= test_function(try_contract_gemv4, "Contract gemv 4", kEpsilon);
     success &= test_function(try_contract_gemm1, "Contract gemm 1", kEpsilon);
     success &= test_function(try_contract_gemm2, "Contract gemm 2", kEpsilon);
     success &= test_function(try_contract_gemm3, "Contract gemm 3", kEpsilon);
@@ -3074,22 +3074,22 @@ int main(int argc, char *argv[])
     success &= test_function(try_contract_gemm6, "Contract gemm 6", kEpsilon);
     success &= test_function(try_contract_gemm7, "Contract gemm 7", kEpsilon);
     success &= test_function(try_contract_gemm8, "Contract gemm 8", kEpsilon);
-//    printf("%s\n", std::string(82, '-').c_str());
-//    printf("Tests: %s\n\n", success ? "All Passed" : "Some Failed");
-
-//    printf("==> Contract Exceptions <==\n\n");
-//    success = true;
-//    printf("%s\n", std::string(82, '-').c_str());
-//    printf("%-50s %-9s %-9s %11s\n", "Description", "Expected", "Observed",
-//           "Delta");
-//    mode = 0;
-//    alpha = 1.0;
-//    beta = 0.0;
-//    printf("%s\n", std::string(82, '-').c_str());
-//    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
-//    printf("%s\n", std::string(82, '-').c_str());
-//    success &= test_function(try_contract_label_fail, "Contract Label Fail",
-//                             kException);
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Tests: %s\n\n", success ? "All Passed" : "Some Failed");
+
+    printf("==> Contract Exceptions <==\n\n");
+    success = true;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("%-50s %-9s %-9s %11s\n", "Description", "Expected", "Observed",
+           "Delta");
+    mode = 0;
+    alpha = 1.0;
+    beta = 0.0;
+    printf("%s\n", std::string(82, '-').c_str());
+    printf("Explicit: alpha = %11.3E, beta = %11.3E\n", alpha, beta);
+    printf("%s\n", std::string(82, '-').c_str());
+    success &= test_function(try_contract_label_fail, "Contract Label Fail",
+                             kException);
 //    success &= test_function(try_contract_einsum_fail1,
 //                             "Contract Einsum Fail 1", kException);
 //    success &= test_function(try_contract_einsum_fail2,

From c6cb5e346d61664fa6479bed7874e05997105a9b Mon Sep 17 00:00:00 2001
From: Francesco Evangelista <francesco.evangelista@gmail.com>
Date: Thu, 25 Jun 2020 09:58:34 -0400
Subject: [PATCH 04/10] Fix issue with Greek labels

---
 src/tensor/core/core.cc | 36 ++++++++++++++++++++++++++++++++++--
 src/tensor/indices.cc   |  2 +-
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/tensor/core/core.cc b/src/tensor/core/core.cc
index 3957ed0..270a083 100644
--- a/src/tensor/core/core.cc
+++ b/src/tensor/core/core.cc
@@ -157,6 +157,29 @@ std::string describe_contraction(ConstTensorImplPtr C, const Indices &Cinds,
     return buffer.str();
 }
 
+#ifdef HAVE_TBLIS
+vector<tblis::label_type> labels_to_num_indices(const Indices &inds,
+                                                vector<string> &labels)
+{
+    size_t n = inds.size();
+    vector<tblis::label_type> num_inds(n);
+    for (size_t k = 0; k < n; k++)
+    {
+        auto it = std::find(labels.begin(), labels.end(), inds[k]);
+        if (it != labels.end())
+        {
+            num_inds[k] = std::distance(labels.begin(), it);
+        }
+        else
+        {
+            labels.push_back(inds[k]);
+            num_inds[k] = labels.size() - 1;
+        }
+    }
+    return num_inds;
+}
+#endif
+
 } // anonymous namespace
 
 void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
@@ -172,6 +195,11 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
     size_t Bsize = Binds.size();
     size_t Csize = Cinds.size();
 
+    vector<string> labels;
+    vector<tblis::label_type> Aninds = labels_to_num_indices(Ainds, labels);
+    vector<tblis::label_type> Bninds = labels_to_num_indices(Binds, labels);
+    vector<tblis::label_type> Cninds = labels_to_num_indices(Cinds, labels);
+
     std::string indices_A = indices::to_string(Ainds, "");
     std::string indices_B = indices::to_string(Binds, "");
     std::string indices_C = indices::to_string(Cinds, "");
@@ -246,8 +274,12 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
         MArray::varray_view<double> A_v(Ap->dims(), &(Ap->data()[0]));
         MArray::varray_view<double> B_v(Bp->dims(), &(Bp->data()[0]));
         MArray::varray_view<double> C_v(this->dims(), &(this->data()[0]));
-        tblis::mult<double>(alpha, A_v, indices_A.c_str(), B_v,
-                            indices_B.c_str(), beta, C_v, indices_C.c_str());
+        //        tblis::mult<double>(alpha, A_v, indices_A.c_str(), B_v,
+        //                            indices_B.c_str(), beta, C_v,
+        //                            indices_C.c_str());
+        tblis::mult<double>(alpha, A_v, &(Aninds.data()[0]), B_v,
+                            &(Bninds.data()[0]), beta, C_v,
+                            &(Cninds.data()[0]));
     }
 #else
     contract(A, B, Cinds, Ainds, Binds, A2, B2, C2, alpha, beta);
diff --git a/src/tensor/indices.cc b/src/tensor/indices.cc
index f477e49..82718d1 100644
--- a/src/tensor/indices.cc
+++ b/src/tensor/indices.cc
@@ -139,7 +139,7 @@ vector<size_t> permutation_order(const Indices &left, const Indices &right)
 
 int find_index_in_vector(const vector<std::string> &vec, const std::string &key)
 {
-    for (size_t ind = 0L; ind < vec.size(); ind++)
+    for (size_t ind = 0L, max_ind = vec.size(); ind < max_ind; ind++)
     {
         if (key == vec[ind])
         {

From 79c4cf2c4fe1c5e6966a2c3e6b4e75e62534903c Mon Sep 17 00:00:00 2001
From: Francesco Evangelista <francesco.evangelista@gmail.com>
Date: Thu, 25 Jun 2020 11:39:49 -0400
Subject: [PATCH 05/10] All tests pass

---
 src/tensor/core/core.cc | 114 +++++++++++++++++++++++++++++++++++-----
 test/test_tblis.cc      |  24 ++++-----
 2 files changed, 112 insertions(+), 26 deletions(-)

diff --git a/src/tensor/core/core.cc b/src/tensor/core/core.cc
index 270a083..bfccea7 100644
--- a/src/tensor/core/core.cc
+++ b/src/tensor/core/core.cc
@@ -178,6 +178,24 @@ vector<tblis::label_type> labels_to_num_indices(const Indices &inds,
     }
     return num_inds;
 }
+
+vector<size_t> make_dims_vec(const vector<tblis::label_type> &labels,
+                             const Dimension &dims,
+                             const vector<tblis::label_type> &unique_labels)
+{
+    vector<size_t> dims_vec(unique_labels.size(), 0);
+    size_t n = labels.size();
+    for (size_t k = 0; k < n; k++)
+    {
+        auto it =
+            std::find(unique_labels.begin(), unique_labels.end(), labels[k]);
+        if (it != unique_labels.end())
+        {
+            dims_vec[std::distance(unique_labels.begin(), it)] = dims[k];
+        }
+    }
+    return dims_vec;
+}
 #endif
 
 } // anonymous namespace
@@ -186,19 +204,83 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
                               const Indices &Cinds, const Indices &Ainds,
                               const Indices &Binds, double alpha, double beta)
 {
-    shared_ptr<TensorImpl> A2;
-    shared_ptr<TensorImpl> B2;
-    shared_ptr<TensorImpl> C2;
 #define TBLIS_DEBUG 0
 #ifdef HAVE_TBLIS
+    ambit::timer::timer_push("pre-TBLIS: internal overhead");
+
     size_t Asize = Ainds.size();
     size_t Bsize = Binds.size();
     size_t Csize = Cinds.size();
 
     vector<string> labels;
-    vector<tblis::label_type> Aninds = labels_to_num_indices(Ainds, labels);
-    vector<tblis::label_type> Bninds = labels_to_num_indices(Binds, labels);
-    vector<tblis::label_type> Cninds = labels_to_num_indices(Cinds, labels);
+    vector<tblis::label_type> A_labels = labels_to_num_indices(Ainds, labels);
+    vector<tblis::label_type> B_labels = labels_to_num_indices(Binds, labels);
+    vector<tblis::label_type> C_labels = labels_to_num_indices(Cinds, labels);
+
+    // Determine unique indices
+    vector<tblis::label_type> unique_labels;
+    unique_labels.insert(unique_labels.end(), A_labels.begin(), A_labels.end());
+    unique_labels.insert(unique_labels.end(), B_labels.begin(), B_labels.end());
+    unique_labels.insert(unique_labels.end(), C_labels.begin(), C_labels.end());
+    std::sort(unique_labels.begin(), unique_labels.end());
+    auto it = std::unique(unique_labels.begin(), unique_labels.end());
+    unique_labels.resize(std::distance(unique_labels.begin(), it));
+
+    vector<size_t> A_dims_vec =
+        make_dims_vec(A_labels, A->dims(), unique_labels);
+    vector<size_t> B_dims_vec =
+        make_dims_vec(B_labels, B->dims(), unique_labels);
+    vector<size_t> C_dims_vec =
+        make_dims_vec(C_labels, this->dims(), unique_labels);
+
+    for (size_t ind = 0; ind < unique_labels.size(); ind++)
+    {
+//        std::cout << "\n " << (char)(unique_labels[ind] + 65);
+//        std::cout << "  " << A_dims_vec[ind];
+//        std::cout << "  " << B_dims_vec[ind];
+//        std::cout << "  " << C_dims_vec[ind];
+//    std::cout << std::endl;
+        auto A_dim = A_dims_vec[ind];
+        auto B_dim = B_dims_vec[ind];
+        auto C_dim = C_dims_vec[ind];
+        if (((A_dim == 0) and (B_dim == 0)) or
+            ((A_dim == 0) and (C_dim == 0)) or ((B_dim == 0) and (C_dim == 0)))
+        {
+            throw std::runtime_error(
+                "Invalid contraction topology - index only occurs "
+                "once.\n" +
+                describe_contraction(this, Cinds, A, Ainds, B, Binds, alpha,
+                                     beta));
+        }
+        if ((B_dim == 0) and (A_dim != C_dim))
+        {
+            throw std::runtime_error("Invalid AC (Left) index size\n" +
+                                     describe_contraction(this, Cinds, A, Ainds,
+                                                          B, Binds, alpha,
+                                                          beta));
+        }
+        if ((A_dim == 0) and (B_dim != C_dim))
+        {
+            throw std::runtime_error("Invalid BC (Right) index size\n" +
+                                     describe_contraction(this, Cinds, A, Ainds,
+                                                          B, Binds, alpha,
+                                                          beta));
+        }
+        if ((C_dim == 0) and (A_dim != B_dim))
+        {
+            throw std::runtime_error("Invalid AB (Contraction) index size\n" +
+                                     describe_contraction(this, Cinds, A, Ainds,
+                                                          B, Binds, alpha,
+                                                          beta));
+        }
+        if ((A_dim * B_dim * C_dim != 0) and
+            ((A_dim != B_dim) or (B_dim != C_dim)))
+        {
+            throw std::runtime_error("Invalid ABC (Hadamard) index size\n" +
+                                     describe_contraction(this, Cinds, A, Ainds, B,
+                                                          Binds, alpha, beta));
+        }
+    }
 
     std::string indices_A = indices::to_string(Ainds, "");
     std::string indices_B = indices::to_string(Binds, "");
@@ -223,6 +305,10 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
     std::cout << std::endl;
 #endif
 
+ambit::timer::timer_pop();
+
+    ambit::timer::timer_push("BLAS");
+
     // Scalar multiplication
     if ((Asize == 0) and (Bsize == 0) and (Csize == 0))
     {
@@ -274,18 +360,18 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
         MArray::varray_view<double> A_v(Ap->dims(), &(Ap->data()[0]));
         MArray::varray_view<double> B_v(Bp->dims(), &(Bp->data()[0]));
         MArray::varray_view<double> C_v(this->dims(), &(this->data()[0]));
-        //        tblis::mult<double>(alpha, A_v, indices_A.c_str(), B_v,
-        //                            indices_B.c_str(), beta, C_v,
-        //                            indices_C.c_str());
-        tblis::mult<double>(alpha, A_v, &(Aninds.data()[0]), B_v,
-                            &(Bninds.data()[0]), beta, C_v,
-                            &(Cninds.data()[0]));
+        tblis::mult<double>(alpha, A_v, &(A_labels.data()[0]), B_v,
+                            &(B_labels.data()[0]), beta, C_v,
+                            &(C_labels.data()[0]));
     }
+    ambit::timer::timer_pop();
 #else
+    shared_ptr<TensorImpl> A2;
+    shared_ptr<TensorImpl> B2;
+    shared_ptr<TensorImpl> C2;
     contract(A, B, Cinds, Ainds, Binds, A2, B2, C2, alpha, beta);
-
 #endif
-}
+} // namespace ambit
 
 void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
                               const Indices &Cinds, const Indices &Ainds,
diff --git a/test/test_tblis.cc b/test/test_tblis.cc
index c364ca1..2207a67 100644
--- a/test/test_tblis.cc
+++ b/test/test_tblis.cc
@@ -3090,18 +3090,18 @@ int main(int argc, char *argv[])
     printf("%s\n", std::string(82, '-').c_str());
     success &= test_function(try_contract_label_fail, "Contract Label Fail",
                              kException);
-//    success &= test_function(try_contract_einsum_fail1,
-//                             "Contract Einsum Fail 1", kException);
-//    success &= test_function(try_contract_einsum_fail2,
-//                             "Contract Einsum Fail 2", kException);
-//    success &= test_function(try_contract_size_fail1, "Contract Size Fail 1",
-//                             kException);
-//    success &= test_function(try_contract_size_fail2, "Contract Size Fail 2",
-//                             kException);
-//    success &= test_function(try_contract_size_fail3, "Contract Size Fail 3",
-//                             kException);
-//    success &= test_function(try_contract_size_fail4, "Contract Size Fail 4",
-//                             kException);
+    success &= test_function(try_contract_einsum_fail1,
+                             "Contract Einsum Fail 1", kException);
+    success &= test_function(try_contract_einsum_fail2,
+                             "Contract Einsum Fail 2", kException);
+    success &= test_function(try_contract_size_fail1, "Contract Size Fail 1",
+                             kException);
+    success &= test_function(try_contract_size_fail2, "Contract Size Fail 2",
+                             kException);
+    success &= test_function(try_contract_size_fail3, "Contract Size Fail 3",
+                             kException);
+    success &= test_function(try_contract_size_fail4, "Contract Size Fail 4",
+                             kException);
     printf("%s\n", std::string(82, '-').c_str());
     printf("Tests: %s\n\n", success ? "All Passed" : "Some Failed");
 

From d25bc625241f1bc68c66b4e36d6ada5beb85a4c3 Mon Sep 17 00:00:00 2001
From: Francesco Evangelista <francesco.evangelista@gmail.com>
Date: Thu, 25 Jun 2020 16:44:04 -0400
Subject: [PATCH 06/10] Cleanup PR

---
 src/tensor/core/core.cc | 185 +++++++++++++++++++---------------------
 src/tensor/core/core.h  |  43 ++++++++--
 2 files changed, 126 insertions(+), 102 deletions(-)

diff --git a/src/tensor/core/core.cc b/src/tensor/core/core.cc
index bfccea7..253d985 100644
--- a/src/tensor/core/core.cc
+++ b/src/tensor/core/core.cc
@@ -138,13 +138,13 @@ void CoreTensorImpl::set(double alpha)
 
 namespace
 {
-
 std::string describe_tensor(ConstTensorImplPtr A, const Indices &Ainds)
 {
     std::ostringstream buffer;
     buffer << A->name() << "[" << indices::to_string(Ainds) << "]";
     return buffer.str();
 }
+
 std::string describe_contraction(ConstTensorImplPtr C, const Indices &Cinds,
                                  ConstTensorImplPtr A, const Indices &Ainds,
                                  ConstTensorImplPtr B, const Indices &Binds,
@@ -156,10 +156,11 @@ std::string describe_contraction(ConstTensorImplPtr C, const Indices &Cinds,
            << describe_tensor(B, Binds);
     return buffer.str();
 }
+} // anonymous namespace
 
 #ifdef HAVE_TBLIS
-vector<tblis::label_type> labels_to_num_indices(const Indices &inds,
-                                                vector<string> &labels)
+vector<tblis::label_type> indices_to_tblis_labels(const Indices &inds,
+                                                  vector<string> &labels)
 {
     size_t n = inds.size();
     vector<tblis::label_type> num_inds(n);
@@ -179,9 +180,10 @@ vector<tblis::label_type> labels_to_num_indices(const Indices &inds,
     return num_inds;
 }
 
-vector<size_t> make_dims_vec(const vector<tblis::label_type> &labels,
-                             const Dimension &dims,
-                             const vector<tblis::label_type> &unique_labels)
+vector<size_t>
+make_vector_of_indices_dims(const vector<tblis::label_type> &labels,
+                            const Dimension &dims,
+                            const vector<tblis::label_type> &unique_labels)
 {
     vector<size_t> dims_vec(unique_labels.size(), 0);
     size_t n = labels.size();
@@ -196,28 +198,26 @@ vector<size_t> make_dims_vec(const vector<tblis::label_type> &labels,
     }
     return dims_vec;
 }
-#endif
 
-} // anonymous namespace
-
-void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
-                              const Indices &Cinds, const Indices &Ainds,
-                              const Indices &Binds, double alpha, double beta)
+void CoreTensorImpl::contract_tblis(ConstTensorImplPtr A, ConstTensorImplPtr B,
+                                    const Indices &Cinds, const Indices &Ainds,
+                                    const Indices &Binds, double alpha,
+                                    double beta)
 {
-#define TBLIS_DEBUG 0
-#ifdef HAVE_TBLIS
     ambit::timer::timer_push("pre-TBLIS: internal overhead");
 
     size_t Asize = Ainds.size();
     size_t Bsize = Binds.size();
     size_t Csize = Cinds.size();
 
+    // convert a vector of strings into a vector of chars. This allows us to use
+    // more complex indices in ambit and still interface with TBLIS
     vector<string> labels;
-    vector<tblis::label_type> A_labels = labels_to_num_indices(Ainds, labels);
-    vector<tblis::label_type> B_labels = labels_to_num_indices(Binds, labels);
-    vector<tblis::label_type> C_labels = labels_to_num_indices(Cinds, labels);
+    vector<tblis::label_type> A_labels = indices_to_tblis_labels(Ainds, labels);
+    vector<tblis::label_type> B_labels = indices_to_tblis_labels(Binds, labels);
+    vector<tblis::label_type> C_labels = indices_to_tblis_labels(Cinds, labels);
 
-    // Determine unique indices
+    // determine unique contraction indices
     vector<tblis::label_type> unique_labels;
     unique_labels.insert(unique_labels.end(), A_labels.begin(), A_labels.end());
     unique_labels.insert(unique_labels.end(), B_labels.begin(), B_labels.end());
@@ -227,19 +227,16 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
     unique_labels.resize(std::distance(unique_labels.begin(), it));
 
     vector<size_t> A_dims_vec =
-        make_dims_vec(A_labels, A->dims(), unique_labels);
+        make_vector_of_indices_dims(A_labels, A->dims(), unique_labels);
     vector<size_t> B_dims_vec =
-        make_dims_vec(B_labels, B->dims(), unique_labels);
+        make_vector_of_indices_dims(B_labels, B->dims(), unique_labels);
     vector<size_t> C_dims_vec =
-        make_dims_vec(C_labels, this->dims(), unique_labels);
+        make_vector_of_indices_dims(C_labels, this->dims(), unique_labels);
 
+    // test for common issues (note that some of these contractions may be valid
+    // for TBLIS but are not supported by ambit
     for (size_t ind = 0; ind < unique_labels.size(); ind++)
     {
-//        std::cout << "\n " << (char)(unique_labels[ind] + 65);
-//        std::cout << "  " << A_dims_vec[ind];
-//        std::cout << "  " << B_dims_vec[ind];
-//        std::cout << "  " << C_dims_vec[ind];
-//    std::cout << std::endl;
         auto A_dim = A_dims_vec[ind];
         auto B_dim = B_dims_vec[ind];
         auto C_dim = C_dims_vec[ind];
@@ -277,38 +274,14 @@ void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
             ((A_dim != B_dim) or (B_dim != C_dim)))
         {
             throw std::runtime_error("Invalid ABC (Hadamard) index size\n" +
-                                     describe_contraction(this, Cinds, A, Ainds, B,
-                                                          Binds, alpha, beta));
+                                     describe_contraction(this, Cinds, A, Ainds,
+                                                          B, Binds, alpha,
+                                                          beta));
         }
     }
-
-    std::string indices_A = indices::to_string(Ainds, "");
-    std::string indices_B = indices::to_string(Binds, "");
-    std::string indices_C = indices::to_string(Cinds, "");
-
-#if TBLIS_DEBUG
-    std::cout << "Calling TBLIS" << std::endl;
-    std::cout << "\n indices_A " << indices_A;
-    auto Adims = A->dims();
-    for (auto d : Adims)
-        std::cout << " " << d;
-    std::cout << std::endl;
-    std::cout << "\n indices_B " << indices_B;
-    auto Bdims = B->dims();
-    for (auto d : Bdims)
-        std::cout << " " << d;
-    std::cout << std::endl;
-    std::cout << "\n indices_C " << indices_C;
-    auto Cdims = this->dims();
-    for (auto d : Cdims)
-        std::cout << " " << d;
-    std::cout << std::endl;
-#endif
-
-ambit::timer::timer_pop();
+    ambit::timer::timer_pop();
 
     ambit::timer::timer_push("BLAS");
-
     // Scalar multiplication
     if ((Asize == 0) and (Bsize == 0) and (Csize == 0))
     {
@@ -320,8 +293,8 @@ ambit::timer::timer_pop();
         TensorImplPtr Ap = const_cast<TensorImplPtr>(A);
         MArray::varray_view<double> A_v(Ap->dims(), &(Ap->data()[0]));
         MArray::varray_view<double> C_v(this->dims(), &(this->data()[0]));
-        tblis::add<double>(alpha * B->data()[0], A_v, indices_A.c_str(), beta,
-                           C_v, indices_C.c_str());
+        tblis::add<double>(alpha * B->data()[0], A_v, A_labels.data(), beta,
+                           C_v, C_labels.data());
     }
     // Index Type B -> add operation
     else if ((Asize == 0) and (Bsize == Csize))
@@ -329,8 +302,8 @@ ambit::timer::timer_pop();
         TensorImplPtr Bp = const_cast<TensorImplPtr>(B);
         MArray::varray_view<double> B_v(Bp->dims(), &(Bp->data()[0]));
         MArray::varray_view<double> C_v(this->dims(), &(this->data()[0]));
-        tblis::add<double>(alpha * A->data()[0], B_v, indices_B.c_str(), beta,
-                           C_v, indices_C.c_str());
+        tblis::add<double>(alpha * A->data()[0], B_v, B_labels.data(), beta,
+                           C_v, C_labels.data());
     }
     // Index Type AB -> dot operation
     else if ((Asize > 0) and (Bsize > 0) and (Csize == 0))
@@ -339,39 +312,76 @@ ambit::timer::timer_pop();
         TensorImplPtr Bp = const_cast<TensorImplPtr>(B);
         MArray::varray_view<double> A_v(Ap->dims(), &(Ap->data()[0]));
         MArray::varray_view<double> B_v(Bp->dims(), &(Bp->data()[0]));
-        data_[0] = alpha * tblis::dot<double>(A_v, indices_A.c_str(), B_v,
-                                              indices_B.c_str()) +
+        data_[0] = alpha * tblis::dot<double>(A_v, A_labels.data(), B_v,
+                                              B_labels.data()) +
                    beta * data_[0];
     }
     // Index Type ABC -> multp operation
     else if ((Asize > 0) and (Bsize > 0) and (Csize > 0))
     {
-#if TBLIS_DEBUG
-        std::cout << "\n Calling mult " << std::endl;
-        std::cout << "\n Get Ap " << std::endl;
-        std::cout << "\n Get Bp " << std::endl;
-        std::cout << "\n Get A_v " << std::endl;
-        std::cout << "\n Get B_v " << std::endl;
-        std::cout << "\n Get C_v " << std::endl;
-        std::cout << "\n Call mult " << std::endl;
-#endif
-        TensorImplPtr Ap = const_cast<TensorImplPtr>(A);
-        TensorImplPtr Bp = const_cast<TensorImplPtr>(B);
-        MArray::varray_view<double> A_v(Ap->dims(), &(Ap->data()[0]));
-        MArray::varray_view<double> B_v(Bp->dims(), &(Bp->data()[0]));
-        MArray::varray_view<double> C_v(this->dims(), &(this->data()[0]));
-        tblis::mult<double>(alpha, A_v, &(A_labels.data()[0]), B_v,
-                            &(B_labels.data()[0]), beta, C_v,
-                            &(C_labels.data()[0]));
+        // check if this is a straight dgemm
+        bool straight_dgemm = true;
+        for (size_t a = 0; a < Asize; a++)
+        {
+            if (a < Csize)
+            {
+                if (A_labels[a] != C_labels[a])
+                    straight_dgemm = false;
+            }
+            else
+            {
+                straight_dgemm = false;
+            }
+        }
+        for (size_t b = 0; b < Bsize; b++)
+        {
+            if (b + Asize < Csize)
+            {
+                if (B_labels[b] != C_labels[b + Asize])
+                    straight_dgemm = false;
+            }
+            else
+            {
+                straight_dgemm = false;
+            }
+        }
+
+        if (straight_dgemm)
+        {
+            shared_ptr<TensorImpl> A2;
+            shared_ptr<TensorImpl> B2;
+            shared_ptr<TensorImpl> C2;
+            contract(A, B, Cinds, Ainds, Binds, A2, B2, C2, alpha, beta);
+        }
+        else
+        {
+            // check if all indices are aligned for a straight DGEMM
+            TensorImplPtr Ap = const_cast<TensorImplPtr>(A);
+            TensorImplPtr Bp = const_cast<TensorImplPtr>(B);
+            MArray::varray_view<double> A_v(Ap->dims(), &(Ap->data()[0]));
+            MArray::varray_view<double> B_v(Bp->dims(), &(Bp->data()[0]));
+            MArray::varray_view<double> C_v(this->dims(), &(this->data()[0]));
+            tblis::mult<double>(alpha, A_v, A_labels.data(), B_v,
+                                B_labels.data(), beta, C_v, C_labels.data());
+        }
     }
     ambit::timer::timer_pop();
+}
+#endif
+
+void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
+                              const Indices &Cinds, const Indices &Ainds,
+                              const Indices &Binds, double alpha, double beta)
+{
+#ifdef HAVE_TBLIS
+    contract_tblis(A, B, Cinds, Ainds, Binds, alpha, beta);
 #else
     shared_ptr<TensorImpl> A2;
     shared_ptr<TensorImpl> B2;
     shared_ptr<TensorImpl> C2;
     contract(A, B, Cinds, Ainds, Binds, A2, B2, C2, alpha, beta);
 #endif
-} // namespace ambit
+}
 
 void CoreTensorImpl::contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
                               const Indices &Cinds, const Indices &Ainds,
@@ -1615,25 +1625,6 @@ map<string, TensorImplPtr> CoreTensorImpl::gesvd() const
     return results;
 }
 
-// TensorImplPtr CoreTensorImpl::cholesky() const
-//{
-//    ThrowNotImplementedException;
-//}
-//
-// map<string, TensorImplPtr> CoreTensorImpl::lu() const
-//{
-//    ThrowNotImplementedException;
-//}
-// map<string, TensorImplPtr> CoreTensorImpl::qr() const
-//{
-//    ThrowNotImplementedException;
-//}
-//
-// TensorImplPtr CoreTensorImpl::cholesky_inverse() const
-//{
-//    ThrowNotImplementedException;
-//}
-
 TensorImplPtr CoreTensorImpl::inverse() const
 {
     squareCheck(this, true);
diff --git a/src/tensor/core/core.h b/src/tensor/core/core.h
index 11830d4..900fd47 100644
--- a/src/tensor/core/core.h
+++ b/src/tensor/core/core.h
@@ -83,6 +83,22 @@ class CoreTensorImpl : public TensorImpl
     void permute(ConstTensorImplPtr A, const Indices &Cinds,
                  const Indices &Ainds, double alpha = 1.0, double beta = 0.0);
 
+    /**
+     * Perform the following contraction using the TBLIS library
+     *  C(Cinds) = alpha * A(Ainds) * B(Binds) + beta * C(Cinds)
+     *
+     * Parameters:
+     *  @param A The left-side factor tensor, e.g., A2
+     *  @param B The right-side factor tensor, e.g., B2
+     *  @param Cinds The indices of tensor C, e.g., "ij"
+     *  @param Ainds The indices of tensor A, e.g., "ik"
+     *  @param Binds The indices of tensor B, e.g., "jk"
+     *  @param alpha The scale applied to the product A*B, e.g., 0.5
+     *  @param beta The scale applied to the tensor C, e.g., 1.0
+     *
+     * Results:
+     *  C is the current tensor, whose data is overwritten. e.g., C2
+     **/
     void contract(ConstTensorImplPtr A, ConstTensorImplPtr B,
                   const Indices &Cinds, const Indices &Ainds,
                   const Indices &Binds, double alpha = 1.0, double beta = 0.0);
@@ -94,6 +110,28 @@ class CoreTensorImpl : public TensorImpl
                   std::shared_ptr<TensorImpl> &C2, double alpha = 1.0,
                   double beta = 0.0);
 
+#if HAVE_TBLIS
+    /**
+     * Perform the following contraction using the TBLIS library
+     *  C(Cinds) = alpha * A(Ainds) * B(Binds) + beta * C(Cinds)
+     *
+     * Parameters:
+     *  @param A The left-side factor tensor, e.g., A2
+     *  @param B The right-side factor tensor, e.g., B2
+     *  @param Cinds The indices of tensor C, e.g., "ij"
+     *  @param Ainds The indices of tensor A, e.g., "ik"
+     *  @param Binds The indices of tensor B, e.g., "jk"
+     *  @param alpha The scale applied to the product A*B, e.g., 0.5
+     *  @param beta The scale applied to the tensor C, e.g., 1.0
+     *
+     * Results:
+     *  C is the current tensor, whose data is overwritten. e.g., C2
+     **/
+    void contract_tblis(ConstTensorImplPtr A, ConstTensorImplPtr B,
+                        const Indices &Cinds, const Indices &Ainds,
+                        const Indices &Binds, double alpha, double beta);
+#endif
+
     void gemm(ConstTensorImplPtr A, ConstTensorImplPtr B, bool transA,
               bool transB, size_t nrow, size_t ncol, size_t nzip, size_t ldaA,
               size_t ldaB, size_t ldaC, size_t offA = 0L, size_t offB = 0L,
@@ -105,11 +143,6 @@ class CoreTensorImpl : public TensorImpl
     map<string, TensorImplPtr> geev(EigenvalueOrder order) const;
     map<string, TensorImplPtr> gesvd() const;
 
-    // TensorImplPtr cholesky() const;
-    // std::map<string, TensorImplPtr> lu() const;
-    // std::map<string, TensorImplPtr> qr() const;
-
-    // TensorImplPtr cholesky_inverse() const;
     TensorImplPtr inverse() const;
     TensorImplPtr power(double power, double condition = 1.0E-12) const;
 

From ea6b7ae87a62262f6a712dd4bc9c44b1714faf58 Mon Sep 17 00:00:00 2001
From: Francesco Evangelista <francesco.evangelista@gmail.com>
Date: Thu, 25 Jun 2020 16:45:30 -0400
Subject: [PATCH 07/10] Tweaks in CMakeFiles.txt

---
 CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 64b2770..e0fb25d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -135,13 +135,14 @@ if(ENABLE_CYCLOPS AND CYCLOPS)
     include_directories(${CYCLOPS}/include)
     add_definitions(-DHAVE_CYCLOPS)
 endif()
+
 if(ENABLE_TBLIS AND TBLIS)
     set(CMAKE_CXX_STANDARD 17)
     include_directories(${TBLIS}/include)
     add_definitions(-DHAVE_TBLIS)
-    link_directories("/Users/fevange/Bin/tblis/lib")
     link_libraries("${TBLIS}/lib/libtblis.a" "${TBLIS}/lib/libtci.a")
 endif()
+
 if (ENABLE_ELEMENTAL AND ELEMENTAL)
     include_directories(${ELEMENTAL}/include)
     add_definitions(-DHAVE_ELEMENTAL)

From 3d6e64596544ab704a7ed3073be52d4b09bb81d1 Mon Sep 17 00:00:00 2001
From: fevangelista <francesco.evangelista@gmail.com>
Date: Thu, 25 Jun 2020 16:49:45 -0400
Subject: [PATCH 08/10] Update CMakeLists.txt

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e0fb25d..3f438de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ set(ambit_DESCRIPTION  "C++ library for the implementation of tensor product cal
 set(ambit_URL          "https://github.com/jturney/ambit")
 set(ambit_LICENSE      "GNU Lesser General Public License v3 (LGPL-3.0)")
 
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(TargetOpenMP_FIND_COMPONENTS "CXX")
 

From 30188e40ee7dd8b28f916f4807134b69481e4551 Mon Sep 17 00:00:00 2001
From: Francesco Evangelista <francesco.evangelista@gmail.com>
Date: Thu, 25 Jun 2020 17:04:20 -0400
Subject: [PATCH 09/10] Small change

---
 src/tensor/core/core.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/tensor/core/core.cc b/src/tensor/core/core.cc
index 323953d..a6ea9c8 100644
--- a/src/tensor/core/core.cc
+++ b/src/tensor/core/core.cc
@@ -28,12 +28,11 @@
  * @END LICENSE
  */
 
-
-#include <numeric>
 #include <algorithm>
 #include <cmath>
 #include <iostream>
 #include <limits>
+#include <numeric>
 #include <stdexcept>
 #include <string.h>
 

From 8e2896b2b3037694fe574a046fef340ba4f8a1c0 Mon Sep 17 00:00:00 2001
From: fevangelista <francesco.evangelista@gmail.com>
Date: Thu, 25 Jun 2020 17:08:32 -0400
Subject: [PATCH 10/10] Update core.cc

---
 src/tensor/core/core.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tensor/core/core.cc b/src/tensor/core/core.cc
index a6ea9c8..dd156d3 100644
--- a/src/tensor/core/core.cc
+++ b/src/tensor/core/core.cc
@@ -301,7 +301,7 @@ void CoreTensorImpl::contract_tblis(ConstTensorImplPtr A, ConstTensorImplPtr B,
     }
     ambit::timer::timer_pop();
 
-    ambit::timer::timer_push("BLAS");
+    ambit::timer::timer_push("TBLIS");
     // Scalar multiplication
     if ((Asize == 0) and (Bsize == 0) and (Csize == 0))
     {