diff --git a/.github/workflows/py_test.yml b/.github/workflows/py_test.yml new file mode 100644 index 0000000..95b358d --- /dev/null +++ b/.github/workflows/py_test.yml @@ -0,0 +1,34 @@ +name: CI - Python Tests + +# Trigger the workflow on pushes and pull requests involving Python files +on: + workflow_dispatch: + push: + paths: + - 'src/**' # Monitor changes in src directory + - 'include/**' # Monitor changes in include directory + pull_request: + paths: + - 'src/**' + - 'include/**' + +jobs: + python-tests: + runs-on: ubuntu-latest # Use the latest Ubuntu virtual environment + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' # Specify the Python version you need + + - name: Install project and dependencies + run: | + python -m pip install --upgrade pip + pip install . # Install the project from the root directory + + - name: Run Python unit tests + run: | + python -m unittest discover -s tests -p '*_test.py' # Discover and run all unittests in the 'tests' folder \ No newline at end of file diff --git a/.gitignore b/.gitignore index de7e739..fa6e840 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ CTestTestfile.cmake *.user *.swp *.tmp +*.egg-info # IDE specific files (if you're using any IDE) # Uncomment for Visual Studio Code diff --git a/CMakeLists.txt b/CMakeLists.txt index 5424b75..ee2e5cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,20 +4,9 @@ cmake_minimum_required(VERSION 3.10) project(SplineNetLib VERSION 1.0) # Set the C++ standard -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED True) -#include fetch content for catch2 -include(FetchContent) - -# fetch catch2 and make usable -FetchContent_Declare( - Catch2 - GIT_REPOSITORY https://github.com/catchorg/Catch2.git - GIT_TAG v3.4.0 -) -FetchContent_MakeAvailable(Catch2) - # Optionally enable warnings for all compilers if(MSVC) add_compile_options(/W4) @@ -35,12 +24,29 @@ add_library(SplineNetLib src/splines.cpp ) +# Add the new template-based class headers and implementations +target_sources(SplineNetLib PRIVATE + src/CTensor.tpp + src/CTensorFunc.tpp + src/CTensorUtils.tpp +) + # Specify the include directories for the library target target_include_directories(SplineNetLib PUBLIC ${PROJECT_SOURCE_DIR}/include) option(ENABLE_TESTS "allow catch2 install and tests to run" OFF) if(ENABLE_TESTS) + #include fetch content for catch2 + include(FetchContent) + + # fetch catch2 and make usable + FetchContent_Declare( + Catch2 + GIT_REPOSITORY https://github.com/catchorg/Catch2.git + GIT_TAG v3.4.0 + ) + FetchContent_MakeAvailable(Catch2) #enable testing enable_testing() @@ -67,6 +73,12 @@ add_executable(SplineNetExample examples/example_network.cpp) # Link the example with the library target_link_libraries(SplineNetExample PRIVATE SplineNetLib) +# Add an example or test executable +add_executable(SplineNetExampleTensor examples/example_CTensor.cpp) + +# Link the example with the library +target_link_libraries(SplineNetExampleTensor PRIVATE SplineNetLib) + # Optional: Install the library and headers install(TARGETS SplineNetLib DESTINATION lib) install(DIRECTORY include/SplineNetLib DESTINATION include) @@ -85,3 +97,9 @@ write_basic_package_version_file( install(FILES "${CMAKE_CURRENT_BINARY_DIR}/SplineNetLibConfigVersion.cmake" DESTINATION lib/cmake/SplineNetLib) + + + + + + diff --git a/README.md b/README.md new file mode 100644 index 0000000..91ab0ca --- /dev/null +++ b/README.md @@ -0,0 +1,105 @@ +# 
About Spline-based-DeepLearning
+
+## bugs
+
+* reshaping a CTensor after performing operations on it may break the gradient calculation!
+
+## Table of contents
+
+[goals](#goals)
+
+[C++ spline documentation](docs/cpp_splines.md)
+
+[C++ CTensor documentation](docs/cpp_CTensor.md)
+
+[python spline documentation](docs/py_splines.md)
+
+
+## New:
+
+* python version of the spline and layer classes
+
+see [install for python](#install-for-python) to install
+
+* batch compatibility for layers
+
+* CTensor class (tensor class with automatic computation graph and gradient propagation)
+
+* python version for CTensor
+
+**The documentation has not been fully updated yet; some features might have changed and new features were added.**
+
+**updates will follow soon**
+
+## goals
+
+1. create visual representations for neural networks by replacing commonly used fully connected layers with spline based layers.
+2. achieve similar or better precision to common deep learning approaches whilst keeping the structure as lightweight and fast as possible.
+3. allow easy adaptability to existing architectures like convolutional and recurrent networks.
+
+## install for c++
+
+```txt
+git clone https://github.com/K-T0BIAS/Spline-based-DeepLearning.git
+cd Spline-based-DeepLearning
+mkdir build
+cd build
+cmake ..
+make
+make install or make install DESTDIR=/path_to_desired_directory
+```
+to run the example: ./SplineNetExample
+
+## include
+
+in .cpp:
+```cpp
+#include "SplineNetLib/Network.hpp"
+```
+
+in the project's CMakeLists.txt:
+```txt
+cmake_minimum_required(VERSION 3.10)
+project(YourProjectDirectory)
+
+# Set C++ standard
+set(CMAKE_CXX_STANDARD 17)
+
+# Find SplineNetLib package
+find_package(SplineNetLib REQUIRED)
+
+# Add executable and link with SplineNetLib
+add_executable(YourProjectDirectory main.cpp)
+target_link_libraries(YourProjectDirectory PRIVATE SplineNetLib)
+```
+
+or in terminal:
+```txt
+g++ -std=c++17 -I/path_to_include -L/path_to_lib -lSplineNetLib main.cpp -o YourProjectDirectory
+```
+* Replace /path_to_include with the path to the installed include directory.
+
+* Replace /path_to_lib with the path where libSplineNetLib.a is located.
+
+## install for python
+
+**Note this only includes splines and layer, no Network class**
+
+**REQUIRED: pybind11, setuptools, wheel (if not already installed, install these with pip)**
+
+```txt
+git clone https://github.com/K-T0BIAS/Spline-based-DeepLearning.git
+cd Spline-based-DeepLearning
+pip install .
+```
+
+
+## License
+
+This project is licensed under the Mozilla Public License 2.0.
+
+Copyright (c) 2024 Tobias Karusseit. See the [LICENSE](./LICENSE) file for details.
+
+This project also uses `pybind11`, which is licensed under the MIT License. See [pybind11 GitHub](https://github.com/pybind/pybind11) for more details.
+
+This project also uses `Catch2`, which is licensed under the Boost Software License 1.0. See [Catch2 GitHub](https://github.com/catchorg/Catch2) for more details.
\ No newline at end of file
diff --git a/docs/cpp_CTensor.md b/docs/cpp_CTensor.md
new file mode 100644
index 0000000..7ab2c38
--- /dev/null
+++ b/docs/cpp_CTensor.md
@@ -0,0 +1,201 @@
+## CPP CTensor Documentation
+
+### include
+
+first include the library header
+
+```cpp
+#include "SplineNetLib/SplineNet.hpp"
+```
+
+### CTensor constructors
+
+The CTensor class is useful for performing tensor operations while automatically tracking the operations a CTensor was involved in.
+A CTensor stores the N-dimensional data in a flat projected vector (std::vector<T>) alongside its actual shape (std::vector<size_t>).
+It will also store all arithmetic functions that it was used in or created from in a grad_fn vector (std::vector<std::unique_ptr<Function<T>>>). Important to note here is that a CTensor only gets a new grad_fn if it was the direct result of an operation (e.g. c = a + b, here only c gets the grad_fn entry).
+grad_fns are classes that hold information about the parents of a CTensor (e.g. c = a + b, here c gets a new grad_fn that knows that a and b are the parents). They also have functions that determine the behaviour of the gradient propagation.
+Calling the backward function on one CTensor will automatically calculate the respective gradients of all other CTensors in the graph.
+
+**Note** that the CTensor architecture was inspired by the pytorch tensor architecture. Read more here : [pytorch](https://github.com/pytorch/pytorch)
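+
+To make this concrete, here is a minimal sketch of the intended workflow, composed from the constructors and operators documented below (the grad() getter is assumed to mirror the data()/shape() getters; the gradient values hold for a plain addition):
+
+```cpp
+#include "SplineNetLib/SplineNet.hpp"
+
+int main() {
+    auto a = SplineNetLib::CTensor({1.0, 2.0, 3.0, 4.0}, {2, 2});
+    auto b = SplineNetLib::CTensor({5.0, 6.0, 7.0, 8.0}, {2, 2});
+
+    auto c = a + b; // only c gets a grad_fn entry, with parents a and b
+
+    c.backward();   // propagates gradients to all CTensors in the graph
+
+    auto grad_a = a.grad(); // {1, 1, 1, 1} for a plain addition
+
+    return 0;
+}
+```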
+
+CTensors have multiple constructor options:
+
+1: construct from nested vector
+
+```cpp
+std::vector<std::vector<double>> data = {{1,2,3},{4,5,6}};
+
+auto CTensor_instance = SplineNetLib::CTensor(data);
+```
+
+this creates a CTensor of shape {2,3}.
+**Note** that new CTensors always have their requires_grad flag set to true.
+
+2: construct from flat initializer list with initializer list of shape:
+
+```cpp
+auto CTensor_instance = SplineNetLib::CTensor({1.0,2.0,3.0,4.0,5.0,6.0}, {2,3});
+```
+
+this will result in the same CTensor as in the previous constructor
+
+3: construct from flat vector and shape
+
+```cpp
+std::vector<size_t> shape = {2,3};
+std::vector<double> data = {1,2,3,4,5,6};
+
+auto CTensor_instance = SplineNetLib::CTensor(data, shape);
+```
+
+4: construct from existing CTensor (shallow copy)
+
+```cpp
+auto first_CTensor = SplineNetLib::CTensor({1.0,2.0,3.0,4.0,5.0,6.0}, {2,3});
+
+auto new_CTensor = SplineNetLib::CTensor(first_CTensor);
+```
+
+**Note** this creates a shallow copy; any change to one will affect the other
+
+4.1: deep copy / clone
+
+If an exact, independent copy of a CTensor is needed, do:
+
+```cpp
+auto first_CTensor = SplineNetLib::CTensor({1.0,2.0,3.0,4.0,5.0,6.0}, {2,3});
+
+auto new_CTensor = first_CTensor.clone();
+```
+
+this will create a deep copy of "first_CTensor"
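+
+Because option 4 shares the underlying storage, shape operations on one copy are visible through the other, while a clone stays independent. A short sketch, using only the constructors above and the shape functions documented further down:
+
+```cpp
+auto first = SplineNetLib::CTensor({1.0,2.0,3.0}, {1,3});
+
+auto shallow = SplineNetLib::CTensor(first); // shares the underlying storage
+auto deep = first.clone();                   // independent deep copy
+
+shallow.squeeze(0); // first.shape() is now {3} as well, storage is shared
+                    // deep.shape() is still {1,3}
+```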
+
+### CTensor getter functions
+
+#### data()
+
+this returns the inner data vector from the CTensor. **Note** that this data vector is the flat representation of the CTensor.
+
+example:
+
+```cpp
+auto CTensor_instance = SplineNetLib::CTensor({1,2,3,4,5,6},{2,3});
+auto data = CTensor_instance.data();
+```
+
+here data will be a std::vector<T> like {1,2,3,4,5,6}, where 'T' is the datatype of CTensor_instance.
+
+#### shape()
+
+this returns the shape of the CTensor
+
+example:
+```cpp
+auto CTensor_instance = SplineNetLib::CTensor({1,2,3,4,5,6},{2,3});
+auto data = CTensor_instance.shape();
+```
+
+this returns a std::vector<size_t> = {2,3}.
+
+### CTensor shape related functions
+
+#### squeeze
+
+squeeze will remove the indexed dimension from the shape. **Note** that the tensor size will remain the same and the size of the adjacent dimension will increase.
+
+syntax:
+```cpp
+Ctensor.squeeze(size_t dim);
+```
+
+example:
+```cpp
+auto CTensor_instance = SplineNetLib::CTensor({1,2,3},{1,3});
+CTensor_instance.squeeze(0);
+```
+
+this will turn shape (1,3) into (3)
+
+#### unsqueeze
+
+unsqueeze will add a dimension of size 1 at the given index
+
+syntax:
+```cpp
+Ctensor.unsqueeze(size_t dim);
+```
+
+example:
+```cpp
+auto CTensor_instance = SplineNetLib::CTensor({1,2,3},{3});
+CTensor_instance.unsqueeze(0);
+```
+
+this turns a CTensor with shape (3) into a CTensor with shape (1,3)
+
+#### expand
+
+expand can increase the size of the selected dimension by a factor n. The data at the selected dimension will be copied and appended n times.
+
+syntax:
+```cpp
+Ctensor.expand(size_t dim, size_t factor);
+```
+
+example:
+```cpp
+auto CTensor_instance = SplineNetLib::CTensor({1,2,3},{1,3});
+CTensor_instance.expand(0, 3);
+```
+
+the shape (1,3) becomes (3,3) and the data
+
+((1,2,3)) becomes ↓
+
+((1,2,3),
+ (1,2,3),
+ (1,2,3))
+
+#### permute
+
+swaps around dimension sizes
+
+syntax:
+```cpp
+Ctensor.permute(index_vector);
+```
+example:
+```cpp
+auto CTensor_instance = SplineNetLib::CTensor({1,2,1,2,1,2,1,2},{1,4,2});
+std::vector<size_t> index_vector = {0,2,1};
+CTensor_instance.permute(index_vector);
+```
+
+the shape (1,4,2) will become (1,2,4). **Note** that this will not change the actual data vector, as the permutation only affects the projection logic, meaning that indexing a permuted CTensor will give different results than before the permutation although the underlying data is the same.
+
+#### transpose
+
+this transposes the CTensor, meaning it swaps the two innermost dimensions (including the data in the flat vector)
+
+syntax:
+
+```cpp
+Ctensor.transpose();
+```
+
+example:
+
+```cpp
+auto CTensor_instance = SplineNetLib::CTensor({1.0,2.0,3.0,4.0,5.0,6.0}, {2,3});
+
+CTensor_instance.transpose();
+```
+
+this will swap dim0 and dim1, so shape (2,3) becomes (3,2). The data vector [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] will change to [1.0, 4.0, 2.0, 5.0, 3.0, 6.0] to fit the new shape.
+
+
+
+**more coming soon**
+
+[<- back to Documentation](../README.md)
\ No newline at end of file
diff --git a/docs/README.md b/docs/cpp_splines.md
similarity index 50%
rename from docs/README.md
rename to docs/cpp_splines.md
index 3dbbb03..f25a2fb 100644
--- a/docs/README.md
+++ b/docs/cpp_splines.md
@@ -1,41 +1,5 @@
-# About Spline-based-DeepLearning
+### CPP Splines Documentation
-
-## Table of contents
-
-[goals](#goals)
-
-[C++ documentation](#C-Implementationdocumentation)
-
-1. [splines](#splines)
-2. [layers](#layers)
-3. [network](#Network)
-
-[python documentation](#python-Implementationdocumentation)
-
-1. [splines](#splines-2)
-2. [layers](#layer-documentation-comming-soon)
-
-## New:
-
-* python version of the spline and layer classes
-
-see [install for python](#install-for-python) to install
-
-* batch compatibility for layers
-
-**documentation was not yet updated some features might have changed and new features were added**
-
-**updates will follow soon**
-
-## goals
-
-1. create visual representations for neural networks by replacing commonly used fully connected layers with spline based layers.
-2. achieve similar or better precision to common deep learning approaches whilst keeping the structure as light-wheight and fast as possible.
-3. allow easy adaptability to existing architectures like convolutional and recurrent networks.
-
-## C++ Implementation/documentation
-
-### Splines
 The splines are the main computation unit of a layer.
 They allow for an easily adjustable and visualizable alternative to weight matrices. To create a spline call:
 
 ```cpp
@@ -76,6 +40,7 @@ double loss_grad = spline.backward(x,d_y,lr)
 
 * double lr = learning rate
 
 ### layers
+
 A layer uses splines as a substitute for weight and bias matrices. Layers are implemented similar to torch.nn.linear();
 
 To create a new layer call:
@@ -124,7 +89,7 @@ vector<vector<double>> pred = layer_instance.forward(X, normalize);
 
 **assuming namespace std**
 
 ```cpp
-vector<double> loss_gradient = layer_instance(X,d_y);
+vector<double> loss_gradient = layer_instance.backward(X,d_y);
 ```
 
 * vector<double> X = input (either from previous layer or from dataset)
@@ -133,7 +98,7 @@ vector<double> loss_gradient = layer_instance(X,d_y);
 
 - batched backward pass:
 
 ```cpp
-vector<vector<double>> loss_gradient = layer_instance(X, d_y);
+vector<vector<double>> loss_gradient = layer_instance.backward(X, d_y);
 ```
 
 * vector<vector<double>> X = batched input (either from previous layer or from dataset)
@@ -179,149 +144,4 @@ std::vector<double> loss_gradient = network_instance.backward(X,d_y)
 
 (when using the manual approach, meaning iterating manually over layers to apply activations, you have to do the backward pass manually as well.)
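+
+To illustrate the manual approach for a single layer, the calls above compose into one (hypothetical) training step; the sample values and the squared-error loss are made up for illustration, and the single-sample overloads are assumed:
+
+```cpp
+// assuming namespace std, with layer_instance and normalize as above
+vector<double> X = {0.1, 0.4, 0.7};  // one input sample
+vector<double> target = {0.0, 1.0};  // desired output
+
+vector<double> pred = layer_instance.forward(X, normalize);
+
+// gradient of a squared-error loss with respect to the prediction
+vector<double> d_y(pred.size());
+for (size_t j = 0; j < pred.size(); j++) {
+    d_y[j] = 2.0 * (pred[j] - target[j]);
+}
+
+// propagates the loss; the returned gradient can be fed to the previous layer
+vector<double> loss_gradient = layer_instance.backward(X, d_y);
+```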
-
-## python Implementation/documentation
-
-### import
-
-```python
-import PySplineNetLib as some_name
-```
-
-### Splines
-Splines are the main computation unit for this approach, they esentially provide a easily visualizable alterp to wheight matricies
-
-- spline creation:
-```python
-spline_instance = PySplineNetLib.spline(points,parameters)
-```
-* points : list = list of points like (num points, 2)
-* parameters : list = list of parameters like (num points - 1, 4)
-
-**full example**
-
-```python
-points : list = [[0.0,0.0],[0.5,0.25],[1.0,1.0]]
-parameters : list = [[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]]
-
-spline_instance = PySplineNetLib.spline(points,parameters)
-```
-
-- spline interpolation:
-
-to properly init a spline call .interpolation()
-
-```python
-spline_instance.interpolation()
-```
-
-this ensures that the parameters are properly set for evaluation and training
-
-- spline forward pass / evaluation:
-
-to evaluate the spline at x call
-
-```python
-y : float = spline_instance.forward(x)
-```
-
-x : float = point to be evaluated
-
-- spline backward / gradient propagation:
-
-to find the splines gradient based on a give loss grad at spline point (x,y) call
-
-```python
-d_y : float = spline_instance.backward(x, d_y, y)
-```
-x : float = point that was last evaluated
-y : float = spline prediction at x
-d_y : float = gradient of loss with (x,target) with respect to spline (x,y) (=> loss.backward() or d_y of next layer)
-
-**Note :**
-
-The gradient of this function call is internally stored in the spline.
-
-- adjust spline based on gradient
-
-to apply the gradient from .backward and adjust the spline call:
-```python
-spline_instance.apply_grad(lr)
-```
-
-lr : float = learning rate (controls how strong the gradient affects the splines points)
-
-### layer documentation comming soon
-
-
-
-## install for c++
-
-```txt
-git clone https://github.com/K-T0BIAS/Spline-based-DeepLearning.git
-cd Spline-based-DeepLearning
-mkdir build
-cd build
-cmake ..
-make
-make install or make install DESTDIR=/path_to_desired_directory
-```
-to run the example : ./SplineNetExample
-
-## include
-
-in .cpp:
-```cpp
-#include "SplineNetLib/Network.hpp"
-```
-
-in the projects cmake:
-```txt
-cmake_minimum_required(VERSION 3.10)
-project(YourProjectDirectory)
-
-# Set C++ standard
-set(CMAKE_CXX_STANDARD 17)
-
-# Find SplineNetLib package
-find_package(SplineNetLib REQUIRED)
-
-# Add executable and link with SplineNetLib
-add_executable(YourProjectDirectory main.cpp)
-target_link_libraries(YourProjectDirectory PRIVATE SplineNetLib)
-```
-
-or in terminal:
-```txt
-g++ -std=c++17 -I/path_to_include -L/path_to_lib -lSplineNetLib main.cpp -o YourProjectDirectory
-```
-* Replace /path_to_include with the path to the installed include directory.
-
-* Replace /path_to_lib with the path where libSplineNetLib.a is located.
-
-## install for python
-
-**Note this only includes splines and layer, no Network class**
-
-**REQUIRED: pybind11, setuptools, wheel (if not already install these with pip)**
-
-```txt
-git clone https://github.com/K-T0BIAS/Spline-based-DeepLearning.git
-cd Spline-based-DeepLearning
-mkdir -p build
-cd build
-cmake ..
-make
-cd ..
-pip install .
-```
-
-
-## License
-
-This project is licensed under the Mozilla Public License 2.0.
-
-Copyright (c) 2024 Tobias Karusseit. See the [LICENSE](./LICENSE) file for details.
-
-This project also uses `pybind11`, which is licensed under the MIT License. See [pybind11 GitHub](https://github.com/pybind/pybind11) for more details.
-
-This project also uses `Catch2`, which is licensed under the Boost Software License 1.0. See [Catch2 GitHub](https://github.com/catchorg/Catch2) for more details.
\ No newline at end of file
+[<- back to Documentation](../README.md)
\ No newline at end of file
diff --git a/docs/py_splines.md b/docs/py_splines.md
new file mode 100644
index 0000000..97a396d
--- /dev/null
+++ b/docs/py_splines.md
@@ -0,0 +1,146 @@
+
+## python Spline Implementation/documentation
+
+### import
+
+```python
+import PySplineNetLib as some_name
+```
+
+### Splines
+Splines are the main computation unit for this approach, they essentially provide an easily visualizable alternative to weight matrices
+
+- spline creation:
+```python
+spline_instance = PySplineNetLib.spline(points,parameters)
+```
+* points : list = list of points like (num points, 2)
+* parameters : list = list of parameters like (num points - 1, 4)
+
+**full example**
+
+```python
+points : list = [[0.0,0.0],[0.5,0.25],[1.0,1.0]]
+parameters : list = [[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]]
+
+spline_instance = PySplineNetLib.spline(points,parameters)
+```
+
+or alternatively do:
+
+```python
+spline_instance = PySplineNetLib.spline([[0.0,0.0],[0.5,0.25],[1.0,1.0]],[[0.0]*4]*2)
+```
+
+- spline interpolation:
+
+to properly init a spline call .interpolation()
+
+```python
+spline_instance.interpolation()
+```
+
+this ensures that the parameters are properly set for evaluation and training
+
+- spline forward pass / evaluation:
+
+to evaluate the spline at x call
+
+```python
+y : float = spline_instance.forward(x)
+```
+
+x : float = point to be evaluated
+
+- spline backward / gradient propagation:
+
+to find the spline's gradient based on a given loss gradient at spline point (x,y) call
+
+```python
+d_y : float = spline_instance.backward(x, d_y, y)
+```
+x : float = point that was last evaluated
+
+y : float = actual target
+
+d_y : float = gradient of loss with (x,target) with respect to spline (x,y) (=> loss.backward() or d_y of next layer)
+
+**Note :**
+
+The gradient of this function call is internally stored in the spline.
+
+- adjust spline based on gradient
+
+to apply the gradient from .backward and adjust the spline call:
+```python
+spline_instance.apply_grad(lr)
+```
+
+lr : float = learning rate (controls how strong the gradient affects the splines points)
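+
+Putting these calls together, a minimal (hypothetical) training step for a single spline could look like this; the input, target, and learning rate are made-up illustration values:
+
+```python
+import PySplineNetLib
+
+spline = PySplineNetLib.spline([[0.0,0.0],[0.5,0.25],[1.0,1.0]], [[0.0]*4]*2)
+spline.interpolation()                 # properly init the spline
+
+x, target = 0.25, 0.5
+y = spline.forward(x)                  # prediction at x
+
+loss_grad = 2 * (y - target)           # gradient of a squared-error loss
+spline.backward(x, loss_grad, target)  # gradient is stored inside the spline
+spline.apply_grad(0.1)                 # adjust the points with lr = 0.1
+```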
+
+## layer
+
+layers combine multiple splines to map an input vector of size m to an output vector of size n by evaluating splines at the input values and combining these outputs into the output. To achieve this the layer uses an m x n spline matrix where for every input $x_i$ there exist n splines.
+
+mathematically the output $y$ is defined like this:
+
+$$
+y_j = \sum_{i=1}^{m} S_{i,j}(x_i), \quad \forall j \in \{1, \dots, n\}
+$$
+
+for example, given input size 3 and output size 2, $y_1$ is the sum of the $S_{i,1}(x_i)$ with $i$ from 1 to 3 (the input size)
+
+To create a new layer do:
+
+```python
+layer_instance = PySplineNetLib.layer(input_size, output_size, detail, max)
+```
+
+where:
+
+input_size : int = the size of the input vector
+output_size : int = the expected size of the output vector
+detail : int = the number of control points for each spline (NOTE that the spline has detail + 2 points, so to get 10 points detail should be 8)
+max : float = the maximum value that any spline in the layer can evaluate (recommended: 1.0, combined with activations that map input and output to range(0,1))
+
+alternatively you can create a layer with start values for points and parameters like this:
+
+```python
+layer_instance = PySplineNetLib.layer(points, parameters)
+```
+
+with:
+points : list = nested list of points like : (input_size, output_size, detail +2, 2 = x,y)
+parameters : list = nested list of parameters like : (input_size, output_size, detail +1, 4)
+
+to fully init the layer call:
+
+```python
+layer_instance.interpolate_splines()
+```
+
+### forward pass
+
+```python
+pred = layer_instance.forward(X)
+```
+
+where:
+
+X : list = single input vector or batched input vector
+pred : list = prediction vector (also with batch dimension if the input was batched)
+
+### backward pass
+
+```python
+d_y = layer_instance.backward(X, d_y)
+```
+
+where:
+
+X is the last input this layer received
+d_y is the propagated gradient of the previous layer
+
+Note that backward will apply the gradient to all splines in the layer automatically
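+
+As with single splines, these calls compose into a small (hypothetical) training step; the sizes, inputs, and loss are illustrative, and the plain forward(X)/backward(X, d_y) forms documented above are assumed:
+
+```python
+import PySplineNetLib
+
+layer = PySplineNetLib.layer(3, 2, 8, 1.0)   # 3 inputs, 2 outputs, 10-point splines
+layer.interpolate_splines()
+
+X = [0.1, 0.4, 0.7]
+pred = layer.forward(X)                      # list with 2 entries
+
+target = [0.0, 1.0]
+d_y = [2 * (p - t) for p, t in zip(pred, target)]  # squared-error gradient
+
+d_x = layer.backward(X, d_y)  # applies the gradient to all splines and
+                              # returns the gradient for a previous layer
+```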
+
+[<- back to Documentation](../README.md)
\ No newline at end of file
diff --git a/examples/example_CTensor.cpp b/examples/example_CTensor.cpp
new file mode 100644
index 0000000..aa87409
--- /dev/null
+++ b/examples/example_CTensor.cpp
@@ -0,0 +1,23 @@
+#include "../include/SplineNetLib/layers.hpp"
+
+using namespace SplineNetLib;
+
+int main() {
+
+    //this will create a CTensor that holds a data vector and shape vector, all other member variables are uninitialized
+    auto a = CTensor({1,1,1,2,2,2},{2,3});
+
+    std::cout<<"created CTensor a with data : "<<vectorToString(a.data())<<"\n";
+
+    //this will create a CTensor from a nested vector
+    std::vector<std::vector<double>> data_b({{1,1,1},{2,2,2}});
+
+    auto b = CTensor(data_b);
+
+    std::cout<<"created CTensor b with data : "<<vectorToString(b.data())<<"\n";
+
+    return 0;
+}
diff --git a/include/SplineNetLib/CTensor.hpp b/include/SplineNetLib/CTensor.hpp
new file mode 100644
--- /dev/null
+++ b/include/SplineNetLib/CTensor.hpp
+// Copyright (c) <2025>,
+//
+// This file is part of the PySplineNetLib project, which is licensed under the
+// Mozilla Public License, Version 2.0 (MPL-2.0).
+//
+// SPDX-License-Identifier: MPL-2.0
+// For the full text of the licenses, see:
+// - Mozilla Public License 2.0: https://opensource.org/licenses/MPL-2.0
+
+
+
+
+#ifndef CTENSOR_HPP
+#define CTENSOR_HPP
+
+#include "CTensorFunc.hpp"
+
+namespace SplineNetLib {
+
+template <typename T>
+requires Scalar<T>
+class Function;
+
+template <typename T>
+class DTensor{
+public:
+    std::vector<T> _data;
+    std::vector<size_t> _shape;
+    std::vector<T> _grad;
+    std::vector<std::unique_ptr<Function<T>>> _grad_fn;
+    int _ref_c;
+
+    DTensor(const std::vector<T>& data, const std::vector<size_t>& shape) :
+        _data(data), _shape(shape), _ref_c(1) {}
+
+    DTensor(const std::initializer_list<T>& data, const std::initializer_list<size_t>& shape) :
+        _data(data), _shape(shape), _ref_c(1) {}
+
+    DTensor(const DTensor& other) : _data(other._data), _shape(other._shape), _grad(other._grad), _ref_c(1) {
+        // Deep copy unique_ptrs to grad fns by calling clone()
+        for (const auto& fn : other._grad_fn) {
+            _grad_fn.push_back(fn ? fn->clone() : nullptr);
+        }
+    }
+
+    void add_ref(){
+        _ref_c++;
+    }
+
+    void rmf_ref(){
+        _ref_c--;
+        if (_ref_c == 0){
+            delete this;
+        }
+    }
+};
+
+
+template <typename T>
+class CTensor {
+private:
+
+    CTensor(DTensor<T>* _t_data) : _tensor_data(_t_data){}
+
+public:
+
+    DTensor<T>* _tensor_data;
+
+    bool requires_grad = true;
+
+    CTensor(const std::initializer_list<T>& init, const std::initializer_list<size_t>& shape) {
+        _tensor_data = new DTensor<T>(init, shape);
+    }
+
+
+    CTensor(const std::vector<T>& data, const std::vector<size_t>& shape) {
+        _tensor_data = new DTensor<T>(data, shape);
+    }
+
+    template <Container U>
+    CTensor(const U& data) {
+        _tensor_data = new DTensor<T>(Flatten<T>(data), get_shape(data));
+    }
+
+    CTensor(const CTensor& other){
+        _tensor_data = other._tensor_data;
+        _tensor_data->_ref_c++;
+    }
+
+
+
+
+    ~CTensor(){
+        _tensor_data->rmf_ref();
+    }
+
+    //-----getters-----
+
+    std::vector<T> data() const { return this->_tensor_data->_data; }
+
+    std::vector<size_t> shape() const { return this->_tensor_data->_shape; }
+
+    std::vector<T> grad() const { return this->_tensor_data->_grad; }
+
+    const std::vector<std::unique_ptr<Function<T>>>& grad_fn() const { return this->_tensor_data->_grad_fn; }
+
+    void zero_grad();
+
+    CTensor clone();
+
+    //-----shape-utils-----
+
+    void squeeze(const size_t &dim) ;//squeezes / removes the input dim and changes the internal projection shape
+
+    void unsqueeze(const size_t &dim) ; //adds a new dim at the input dim
+
+    void expand(const size_t &dim, const size_t &factor) ; //expands the dimension by factor so that shape (3,2) expanded(1,3)
+    //becomes: (3,6) (will duplicate values at the dimension to match the new projected shape)
+
+    void permute(const std::vector<size_t> &permutation_indecies) ; //will swap dimensions at the permutation indices
+    //shape (2,3,4) permute(2,0,1) becomes: (4,2,3)
+
+    void reduce(const size_t &dim, const size_t &factor) ;
+
+    void transpose() ;
+
+    //-----auto_grad-----
+    //delete all grad fns of this
+    void clear_history() ;
+    //recursive delete of grad fns for all tensors in the graph with this as root
+    void clear_graph() ;
+    //maybe add an overload of this so that if no arg was passed the propagated grad is set to {}, then the function below could take all args by ref
+    void backward(std::vector<T> prop_grad = {}) ;
+
+
+    //-----operator-----
+
+    auto operator[](size_t idx) ;
+
+    auto operator+(CTensor &other) ;
+
+    auto operator-(CTensor &other) ;
+
+    auto operator*(CTensor &other) ;
+
+    //CTensor& operator=(const CTensor &other) noexcept;
+
+    //CTensor& operator=(CTensor &&other) ;
+
+
+};
+/*
+template <typename T>
+CTensor<T> zeros(std::vector<size_t> shape) ;
+
+template <typename T>
+CTensor<T> ones(std::vector<size_t> shape) ;
+
+template <typename T>
+CTensor<T> random(std::vector<size_t> shape,
T min, T max) ;
+
+template <typename T, Scalar U>
+CTensor<T> Tensor(U data) ;
+
+template <typename T, Container U>
+CTensor<T> Tensor(U data) ;
+
+template <typename T>
+CTensor<T> Tensor(std::vector<T> data, std::vector<size_t> shape) ;
+*/
+} //namespace
+
+#include "../src/CTensor.tpp"
+
+
+#endif
\ No newline at end of file
diff --git a/include/SplineNetLib/CTensorFunc.hpp b/include/SplineNetLib/CTensorFunc.hpp
new file mode 100644
index 0000000..8f86a3e
--- /dev/null
+++ b/include/SplineNetLib/CTensorFunc.hpp
@@ -0,0 +1,140 @@
+// Copyright (c) <2025>,
+//
+// This file is part of the PySplineNetLib project, which is licensed under the
+// Mozilla Public License, Version 2.0 (MPL-2.0).
+//
+// SPDX-License-Identifier: MPL-2.0
+// For the full text of the licenses, see:
+// - Mozilla Public License 2.0: https://opensource.org/licenses/MPL-2.0
+
+
+
+
+#ifndef CTENSORFUNC_HPP
+#define CTENSORFUNC_HPP
+
+#include "CTensorUtils.hpp"
+
+namespace SplineNetLib {
+
+typedef enum {
+    RESHAPE_SQUEEZE = 1,
+    RESHAPE_UNSQUEEZE = 2,
+    RESHAPE_EXPAND = 3,
+    RESHAPE_REDUCE = 4,
+    RESHAPE_PERMUTE = 5,
+    RESHAPE_TRANSPOSE = 6
+} ReshapeType;
+
+
+template <typename T>
+class CTensor;
+
+//base function class for specialization
+template <typename T>
+requires Scalar<T>
+class Function {
+public:
+    //pointers to this function's "parents" (like : a operator b)
+    std::shared_ptr<CTensor<T>> a;
+    std::shared_ptr<CTensor<T>> b;
+    std::vector<size_t> a_shape;
+    std::vector<size_t> b_shape;
+
+    Function(std::shared_ptr<CTensor<T>> A, std::shared_ptr<CTensor<T>> B) : a(A), b(B),
+        /*nullptr check for A and B to ensure no segfaults happen ->*/a_shape(A ? A->shape() : std::vector<size_t> {1}),
+        b_shape(B ? B->shape() : std::vector<size_t> {1}) {}
+
+    //virtual destructor
+    virtual ~Function() = default;
+
+    virtual std::vector<T> fwd() = 0;
+
+    virtual void backward(std::vector<T> &prop_grad, CTensor<T> *result) = 0;
+
+    virtual std::unique_ptr<Function<T>> clone() const = 0;
+
+    static std::unordered_set<Function<T>*> global_chain;
+
+    void clear_graph_f();
+};
+
+template <typename T>
+requires Scalar<T>
+std::unordered_set<Function<T>*> Function<T>::global_chain;
+
+//addition class for CTensor::operator+
+template <typename T>
+requires Scalar<T>
+class AddFunction : public Function<T> {
+public:
+
+    //construct base class
+    AddFunction(std::shared_ptr<CTensor<T>> a, std::shared_ptr<CTensor<T>> b) : Function<T>(a, b) {}
+
+    std::vector<T> fwd() override ;
+
+    void backward(std::vector<T> &prop_grad, CTensor<T> *result) override;
+
+    virtual std::unique_ptr<Function<T>> clone() const override;
+};
+
+//subtraction function class for CTensor::operator-
+template <typename T>
+requires Scalar<T>
+class SubFunction : public Function<T> {
+public:
+
+    //construct base class
+    SubFunction(std::shared_ptr<CTensor<T>> a, std::shared_ptr<CTensor<T>> b) : Function<T>(a, b) {}
+
+    std::vector<T> fwd() override;
+
+    void backward(std::vector<T> &prop_grad, CTensor<T> *result) override;
+
+    virtual std::unique_ptr<Function<T>> clone() const override;
+
+};
+
+//matrix multiplication function class for CTensor::operator*
+template <typename T>
+requires Scalar<T>
+class MatMulFunction : public Function<T> {
+public:
+
+    //construct base class
+    MatMulFunction(std::shared_ptr<CTensor<T>> a, std::shared_ptr<CTensor<T>> b) : Function<T>(a, b) {}
+
+    std::vector<T> fwd() override;
+
+    void backward(std::vector<T> &prop_grad, CTensor<T> *result) override;
+
+    virtual std::unique_ptr<Function<T>> clone() const override;
+};
+
+template <typename T>
+requires Scalar<T>
+class ReShapeFunction : public Function<T> {
+public :
+
+    ReshapeType operation;
+    /*
+    std::vector<size_t> original_shape;
+    std::vector<size_t> new_shape;
+    */
+
+    ReShapeFunction(std::shared_ptr<CTensor<T>> a, ReshapeType _operation) :
+        Function<T>(a, nullptr),operation(_operation){}
+
+    std::vector<T> fwd() override;
+
+    void backward(std::vector<T> &prop_grad, CTensor<T> *result) override;
+
+    virtual std::unique_ptr<Function<T>> clone() const override;
+};
+
+} //namespace
+
+#include "../src/CTensorFunc.tpp"
+
+#endif
\ No newline at end of file
diff --git a/include/SplineNetLib/CTensorUtils.hpp b/include/SplineNetLib/CTensorUtils.hpp
new file mode 100644
index 0000000..ea4f4b3
--- /dev/null
+++ b/include/SplineNetLib/CTensorUtils.hpp
@@ -0,0 +1,99 @@
+// Copyright (c) <2025>,
+//
+// This file is part of the PySplineNetLib project, which is licensed under the
+// Mozilla Public License, Version 2.0 (MPL-2.0).
+//
+// SPDX-License-Identifier: MPL-2.0
+// For the full text of the licenses, see:
+// - Mozilla Public License 2.0: https://opensource.org/licenses/MPL-2.0
+
+
+
+
+#ifndef CTENSORUTILS_HPP
+#define CTENSORUTILS_HPP
+
+#include <algorithm>
+#include <concepts>
+#include <cstddef>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace SplineNetLib {
+
+
+template <typename T>
+std::string vectorToString(const std::vector<T>& vec);
+
+template <typename T>
+concept Container = requires(T t) {
+    typename T::value_type;     // Requires a nested `value_type` (if T::value_type fails T is not a Container)
+    typename T::iterator;       // Requires a nested `iterator`
+    typename T::const_iterator; // Also requires a nested `const_iterator` for const containers
+    { t.begin() } -> std::input_iterator; // Requires a `begin()` function whose return type satisfies std::input_iterator
+    { t.end() } -> std::input_iterator;   // Requires an `end()` function whose return type satisfies std::input_iterator
+    { t.size() } -> std::convertible_to<size_t>; // also requires a `size()` function with a return type convertible to size_t
+};
+
+template <typename T>
+concept Scalar = std::is_arithmetic_v<T>; // Requires T to satisfy is_arithmetic_v
+
+// Function to generate a std::vector with random values
+template <typename T>
+std::vector<T> randomVector(size_t size, T min, T max) ;
+
+
+//base case for the recursive n_dims check
+template <Scalar T>
+int get_depth(const T &scalar) ;
+
+//Recursive case for the n_dims check, will return the number of dimensions of the input
+template <Container T>
+int get_depth (const T &vec) ;
+
+//base case for the get_shape func, will return the shape
+template <Scalar T>
+std::vector<size_t> get_shape(const T &scalar, std::vector<size_t> Shape = {}) ;
+
+//Recursive function to get the shape of a container (assumes uniform dims), pushes back the size of the container at the current recursion depth
+template <Container T>
+std::vector<size_t> get_shape(const T &vec, std::vector<size_t> Shape = {}) ;
+
+//base case if input is scalar type (will push back to the result in place)
+template <typename U, Scalar T>
+void Flatten(const T &in_scalar, std::vector<U> &result) ;
+
+//Recursive case, will move down one dim into the input and recursively call itself for all "values" in input
+template <typename U, Container T>
+void Flatten(const T &in_vector, std::vector<U> &result) ;
+
+// Flatten control function, will create the result variable and initialize the recursion
+template <typename U, Container T>
+std::vector<U> Flatten(const T& in_vector) ;
+
+// calculate the stride length to get to the next index in dim for the projected vector
+size_t stride(size_t idx, const std::vector<size_t> &shape) ;
+
+//math -------------------
+
+template <typename T>
+std::vector<T> matmul(const std::vector<T> &A, const std::vector<T> &B, const std::vector<size_t> &A_shape, const std::vector<size_t> &B_shape) ;
+
+template <typename T>
+requires Scalar<T>
+std::vector<T> permute_vec(const std::vector<T>& A, const std::vector<size_t>& A_shape, const std::vector<size_t>& permutation_indices) ;
+
+//swaps last two dimensions as if transposing a ctensor
+std::vector<size_t> transpose_shape(const std::vector<size_t>& shape) ;
+
+} //namespace
+
+#include "../src/CTensorUtils.tpp"
+
+#endif
\ No newline at end of file
diff --git a/include/SplineNetLib/splines.hpp b/include/SplineNetLib/splines.hpp
index
76a315c..c8b7577 100644 --- a/include/SplineNetLib/splines.hpp +++ b/include/SplineNetLib/splines.hpp @@ -17,6 +17,7 @@ #include #include #include +#include "CTensor.hpp" /* #include #include diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9c5a55d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=42", "wheel", "pybind11"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/setup.py b/setup.py index af120ca..f6f7032 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ import os import subprocess import pybind11 +import sys def build_cpp_library(): #make sure cmake is installed @@ -62,6 +63,7 @@ def build_python_extension(): libraries=["SplineNetLib"], # Link with your precompiled library library_dirs=[get_library_path()], # Directory containing the precompiled library language="c++", # Ensure it's compiled as C++ + extra_compile_args=["-std=c++20"], ) ], install_requires=[ diff --git a/src/CTensor.tpp b/src/CTensor.tpp new file mode 100644 index 0000000..376ac1f --- /dev/null +++ b/src/CTensor.tpp @@ -0,0 +1,375 @@ +// Copyright (c) <2025>, +// +// This file is part of the PySplineNetLib project, which is licensed under the +// Mozilla Public License, Version 2.0 (MPL-2.0). +// +// SPDX-License-Identifier: MPL-2.0 +// For the full text of the licenses, see: +// - Mozilla Public License 2.0: https://opensource.org/licenses/MPL-2.0 + + + + + +#ifndef CTENSOR_TPP +#define CTENSOR_TPP + + +#include "../include/SplineNetLib/CTensor.hpp" + +namespace SplineNetLib { + +template +void CTensor::zero_grad(){ + this->_tensor_data->_grad = std::vector(this->_tensor_data->_data.size(),static_cast(0)); +} + + +template +void CTensor::squeeze(const size_t& dim) { + auto n_dims = this->_tensor_data->_shape.size(); + if (n_dims == 1) { + throw std::invalid_argument("CTensor with 1 Dim can not be squeezed to be 0D\n"); + } else if (dim >= n_dims) { + throw std::invalid_argument("target Dim: "+std::to_string(dim)+"is out of range of CTensor with n_dims: "+std::to_string(n_dims)+"\n"); + } else if (dim == n_dims-1){ + this->_tensor_data->_shape[dim-1] *= this->_tensor_data->_shape[dim]; + this->_tensor_data->_shape.pop_back(); + } else { + this->_tensor_data->_shape[dim] *= this->_tensor_data->_shape[dim+1]; + this->_tensor_data->_shape.erase(this->_tensor_data->_shape.begin() + dim + 1); + } + + if (this->requires_grad) { + auto new_fn = std::make_unique>(std::make_shared>(*this), RESHAPE_SQUEEZE); + + this->_tensor_data->_grad_fn.push_back(std::move(new_fn)); + } +} + +template +void CTensor::unsqueeze(const size_t &dim) { + auto n_dims = this->_tensor_data->_shape.size(); + auto* shape = &(this->_tensor_data->_shape);//make a temp ptr to the shape vector for easier syntax + if (dim > n_dims) { + (*shape).push_back(1); + } else { + (*shape).insert((*shape).begin() + dim, 1); + } + + if (this->requires_grad) { + auto new_fn = std::make_unique>(std::make_shared>(*this), RESHAPE_UNSQUEEZE); + + this->_tensor_data->_grad_fn.push_back(std::move(new_fn)); + } +} + +template +void CTensor::expand(const size_t &dim, const size_t &factor) { + if (factor <= 1) { + return; // No expansion needed + } + + auto* shape = &(this->_tensor_data->_shape);//make a temp ptr to the shape vector for easier syntax + auto* data = &(this->_tensor_data->_data); + auto n_dims = (*shape).size(); + + + // Check if the specified dimension is valid + if (dim >= n_dims) { + throw std::invalid_argument("Input dim: " + 
std::to_string(dim) + " cannot be larger than _n_dims: " + std::to_string(n_dims)); + } + + // Calculate the size of the sub-vectors to repeat + size_t sub_vector_size = 1; + for (size_t i = dim + 1; i < n_dims; i++) { + sub_vector_size *= (*shape)[i]; + } + + size_t data_size_per_expansion = (*shape)[dim] * sub_vector_size; + + // Repeat the data by the specified factor + size_t idx = 0; + while (idx < (*data).size()) { + std::vector sub_vector((*data).begin() + idx, (*data).begin() + idx + data_size_per_expansion); + + // Insert the sub-vector factor times + for (size_t i = 1; i < factor; i++) { + (*data).insert((*data).begin() + idx, sub_vector.begin(), sub_vector.end()); + idx += data_size_per_expansion; + } + + idx += data_size_per_expansion; + } + + auto new_shape = (*shape); + new_shape[dim] *= factor; + + //create new addfunction with shared ptr to this and other + auto new_fn = std::make_unique>(std::make_shared>(*this), RESHAPE_EXPAND); + + // Update the shape and number of dimensions + (*shape)[dim] *= factor; + + this->_tensor_data->_grad_fn.push_back(std::move(new_fn)); + +} + +template +void CTensor::reduce(const size_t &dim, const size_t &factor) { + if (factor <= 1) { + return; // No reduction needed + } + + auto* shape = &(this->_tensor_data->_shape); // Pointer to shape vector + auto* data = &(this->_tensor_data->_data); + size_t n_dims = shape->size(); + + // Ensure valid dimension + if (dim >= n_dims) { + throw std::invalid_argument("Input dim: " + std::to_string(dim) + + " cannot be larger than _n_dims: " + std::to_string(n_dims)); + } + + // Ensure the shape is divisible by factor + if ((*shape)[dim] % factor != 0) { + return; + } + + // Calculate the size of sub-vectors + size_t sub_vector_size = 1; + for (size_t i = dim + 1; i < n_dims; i++) { + sub_vector_size *= (*shape)[i]; + } + + size_t idx = 0; + while (idx < data->size()) { + // Remove (factor - 1) repetitions of the sub-vector + for (size_t i = 1; i < factor; i++) { + data->erase(data->begin() + idx, data->begin() + idx + sub_vector_size); + } + idx += sub_vector_size; // Move to the next section after all removals + } + + (*shape)[dim] /= factor; +} + +template +void CTensor::permute(const std::vector &permutation_indecies) { + //renamed global func permute to permute_vec so that func czll in class is nolonger ::permute now permute_vec + this->_tensor_data->_data = permute_vec(this->_tensor_data->_data, this->_tensor_data->_shape, permutation_indecies); + + auto shape_copy = this->shape(); + for (size_t i = 0; i < permutation_indecies.size(); i++) { + this->_tensor_data->_shape[i] = shape_copy[permutation_indecies[i]]; + } +} + +template +void CTensor::transpose() { + if (this->_tensor_data->_shape.size()>=2) { + std::vector transpose_idx; + for (size_t i = 0; i < this->_tensor_data->_shape.size()-2; i++) { + transpose_idx.push_back(i); + } + + transpose_idx.push_back(this->_tensor_data->_shape.size() - 1); + transpose_idx.push_back(this->_tensor_data->_shape.size() - 2); + + this->permute(transpose_idx); + } +} + + + +//-----operator-----/ +template +auto CTensor::operator[](size_t idx){ + std::vector Shape = this->shape(); + //check if index should exist in multi dim space + if (idx >= Shape[0]) { + throw std::invalid_argument("index ["+std::to_string(idx)+"] is out of range with dim of size : "+std::to_string(Shape[0])+"\n"); + } + //if vector is 1D to begin with + if (Shape.size() == 1) { + //create sub vector with scalar data at data[idx] + std::vectordata())::value_type> sub_vector = 
{this->data()[idx]}; + //std::cout<<"operator[] scalar case debug data[idx]="<data() in range constructor (could likely also be used in decltype) + auto data = this->data(); + //creates a vector of same type as stored in CTensor using range constructor from flat_idx to flat_idx + size_sub_vector + std::vectordata())::value_type> sub_vector(data.begin() + flat_idx, data.begin() + flat_idx + size_sub_vector); + + + auto new_CT = CTensor(sub_vector, Shape); + return new_CT; +} + +template +auto CTensor::operator+(CTensor& other){ + //create new addfunction with shared ptr to this and other + auto new_fn = std::make_unique>(std::make_shared>(*this), + std::make_shared>(other)); + auto res_vec = new_fn->fwd(); //add this data and other data + auto result = CTensor(res_vec, this->shape());//create the result CTensor + if (this->requires_grad || other.requires_grad) { + result.requires_grad = true; + + result._tensor_data->_grad_fn.push_back(std::move(new_fn)); + } else { + result.requires_grad = false; + } + return result; +} + + +template +auto CTensor::operator-(CTensor &other) { + //create new SubFunction with shared ptr to this and other + auto new_fn = std::make_unique>(std::make_shared>(*this), + std::make_shared>(other)); + auto res_vec = new_fn->fwd(); + auto result = CTensor(res_vec, this->shape()); + if (this->requires_grad || other.requires_grad) { + result.requires_grad = true; + + result._tensor_data->_grad_fn.push_back(std::move(new_fn)); + } else { + result.requires_grad = false; + } + return result; +} + +template +auto CTensor::operator* (CTensor &other) { + //create the parent function for the result using parents this and other + //this will make a shared ptr of the base class. this works since the functions in tje derived classes are all overrides + //this is doen so that all grad fns of a CTensor can be stored in the same std::vector>> _grad_fn + auto new_fn = std::make_unique>(std::make_shared>(*this), + std::make_shared>(other)); + //use new_fn.forward() to perfo5m the addition + auto res_vec = new_fn->fwd(); + + std::vector result_shape; + auto this_shape = this->shape(); + auto other_shape = other.shape(); + for (size_t i = 0; i < this_shape.size() -1; i++){ + result_shape.push_back(this_shape[i]); + } + result_shape.push_back(other_shape[other_shape.size()-1]); + + auto result = CTensor(res_vec, result_shape); + //assign parent function to the result._grad_fn + if (this->requires_grad || other.requires_grad) { + result.requires_grad = true; + + result._tensor_data->_grad_fn.push_back(std::move(new_fn)); + } else { + result.requires_grad = false; + } + + return result; +} + + + + + +template +void CTensor::clear_history() { + this->_tensor_data->_grad_fn.clear(); + //this should be safe since Function uses pointers to Ctensor and the Tensor will survive the _grad_fn clear +} + +template +void CTensor::clear_graph() { + //recursive call to traverse grad graph + for (auto &fn : this->_tensor_data->_grad_fn) { + + fn->clear_graph_f(); + } + //clear this CTensor history when sub graph is cleared + this->clear_history(); +} + +//can be improved with overload if no arg is passe to use {} so that this function below can use refernces +template +void CTensor::backward(std::vector prop_grad) { + /* + //go through all parent Functions + for (auto &fn : this->_tensor_data->_grad_fn) { + if (fn) { + //std::cout<backward(prop_grad, this); + //std::cout<<"debug Ct bwd fn bwd finish\n"; + } + } + //std::cout<<"debug Ct bwd fn all bwd finish\n"; + */ + //testing with revers as this 
makes more sense fir the tree traversal + for (int i = this->_tensor_data->_grad_fn.size() - 1; i >= 0; i--){ + if (this->_tensor_data->_grad_fn[i]){ + this->_tensor_data->_grad_fn[i]->backward(prop_grad, this); + } + } +} + +template +CTensor CTensor::clone() { + CTensor Cloned_CTensor(new DTensor(*_tensor_data)); + return Cloned_CTensor; +} +/* untestee +template +CTensor zeros(std::vector shape) { + std::vector data(stride(-1,shape),T(0)); + return CTensor(data, shape); +} + +template +CTensor ones(std::vector shape) { + std::vector data(stride(-1,shape),T(1)); + return CTensor(data, shape); +} + +template +CTensor random(std::vector shape, T min, T max) { + return CTensor(randomVector(stride(-1,shape),min,max),shape); +} + +template +CTensor Tensor(U data) { + return CTensor(data); +} + +template +CTensor Tensor(U data) { + return CTensor(data); +} + +template +CTensor Tensor(std::vector data, std::vector shape) { + return CTensor(data,shape); +} +*/ + +} //namespace + +#endif \ No newline at end of file diff --git a/src/CTensorFunc.tpp b/src/CTensorFunc.tpp new file mode 100644 index 0000000..10b5df8 --- /dev/null +++ b/src/CTensorFunc.tpp @@ -0,0 +1,368 @@ +// Copyright (c) <2025>, +// +// This file is part of the PySplineNetLib project, which is licensed under the +// Mozilla Public License, Version 2.0 (MPL-2.0). +// +// SPDX-License-Identifier: MPL-2.0 +// For the full text of the licenses, see: +// - Mozilla Public License 2.0: https://opensource.org/licenses/MPL-2.0 + + + + +#ifndef CTENSORFUNC_TPP +#define CTENSORFUNC_TPP + + +#include "../include/SplineNetLib/CTensorFunc.hpp" + +namespace SplineNetLib { + +template +requires Scalar +void Function::clear_graph_f() { + a->clear_graph(); + b->clear_graph(); +} + +template +requires Scalar +std::vector AddFunction::fwd() { + + auto* a_data = &(this->a->_tensor_data->_data); + auto* b_data = &(this->b->_tensor_data->_data); + + T l; + T r; + + std::vector res_vec; + for (size_t i = 0; i < (*a_data).size() || i < (*b_data).size(); i++){ + l = (i < (*a_data).size()) ? (*a_data)[i] : 0 ; + r = (i < (*b_data).size()) ? 
(*b_data)[i] : 0 ; + res_vec.push_back(l + r); + } + return res_vec; +} + + + +template +requires Scalar +void AddFunction::backward(std::vector &prop_grad, CTensor *result) { + //std::cout<<"debug add bwd call\n"; + //check if func already exists in the recursive chain + if (Function::global_chain.find(this) != Function::global_chain.end()) { + std::cout<<"cyle detected in grad backward, ensure no incorrect reassignments to to Ctensors that were previously used in the computation graph\n"; + return; + } + //std::cout<<"debug add bwd cycle check\n"; + //insert this func into the chain for cycle detection + Function::global_chain.insert(this); + //std::cout<<"debug add bwd chain insert\n"; + if (prop_grad.empty()){ + for (size_t i=0; i < this->a->data().size(); i++) { + prop_grad.push_back(1); + //std::cout<<"debug add bwd empty grad set to 1s \n"; + } + } + //std::cout<<"debug add bwd grad add\n"; + //ensure self dependend gradients arent added twice + if (result != this->a.get()) { + //std::cout<<"debug add bwd this->a gradient propagation initialized\n"; + //std::cout<<"debug add bwd this a grad size:"<a->grad().size()<<"prop_grad size: "<a->requires_grad == true) { + if (this->a->grad().empty()){ + //std::cout<<"a grqd empty "<a->grad().size()<<"\n"; + this->a->zero_grad(); + } + //std::cout<<"working on grad of a at "<a<<" "<a->grad())<<" "<a->_tensor_data->_grad[i] += prop_grad[i]; + //std::cout<<"debug add bwd accumulation step\n"; + } + } + //std::cout<<"debug add bwd this a grad accumulated\n"; + this->a->backward(prop_grad); + //std::cout<<"debug add bwd this a recursion finished\n"; + } + //ensure self dependend gradients arent added twice + if (result != this->b.get()) { + //std::cout<<"debug add bwd this->b gradient propagation initialized\n"; + //std::cout<<"debug add bwd this b grad size:"<b->grad().size()<<"prop_grad size: "<b->requires_grad == true) { + if (this->b->grad().empty()){ + //std::cout<<"b grqd empty "<b->grad().size()<<"\n"; + this->b->zero_grad(); + } + //std::cout<<"working on grad of b at "<b<<" "<b->grad())<<" "<b->_tensor_data->_grad[i] += prop_grad[i]; + //std::cout<<"debug add bwd accumulation step\n"; + } + } + //std::cout<<"debug add bwd this b grad accumulated\n"; + this->b->backward(prop_grad); + //std::cout<<"debug add bwd this b recursion finished\n"; + } + //std::cout<<"debug add bwd recursive propagation\n"; + //remove this func from the chain if all its recursive processes finished + Function::global_chain.erase(this); + //std::cout<<"debug add bwd chain erase\n"; +} + +template +requires Scalar +std::unique_ptr> AddFunction::clone() const { + return std::make_unique>(*this); +} + + +template +requires Scalar +std::vector SubFunction::fwd() { + + auto* a_data = &(this->a->_tensor_data->_data); + auto* b_data = &(this->b->_tensor_data->_data); + + T l; + T r; + + std::vector res_vec; + for (size_t i = 0; i < (*a_data).size() || i < (*b_data).size(); i++){ + l = (i < (*a_data).size()) ? (*a_data)[i] : 0 ; + r = (i < (*b_data).size()) ? 
(*b_data)[i] : 0 ; + res_vec.push_back(l - r); + } + return res_vec; +} + + +template +requires Scalar +void SubFunction::backward(std::vector &prop_grad, CTensor *result) { + + //check if func already exists in the recursive chain + if (Function::global_chain.find(this) != Function::global_chain.end()) { + std::cout<<"cyle detected in Ctensor.backward(), ensure no incorrect reassignments to to Ctensors that were previously used in the computation graph\n"; + return; + } + + //insert this func into the chain for cycle detection + Function::global_chain.insert(this); + + if (prop_grad.empty()){ + for (size_t i=0; i < this->a->data().size(); i++) { + prop_grad.push_back(1); + } + } + + //ensure self dependend gradients arent added twice + if (result != this->a.get()) { + if (this->a->requires_grad == true) { + if (this->a->grad().empty()){ + //std::cout<<"a grqd empty "<a->grad().size()<<"\n"; + this->a->zero_grad(); + } + for (size_t i = 0; i < prop_grad.size(); i++) { + + this->a->_tensor_data->_grad[i] += prop_grad[i]; + + } + } + this->a->backward(prop_grad); + } + //ensure self dependend gradients arent added twice + if (result != this->b.get()) { + if (this->b->requires_grad == true) { + if (this->b->grad().empty()){ + //std::cout<<"b grqd empty "<b->grad().size()<<"\n"; + this->b->zero_grad(); + } + for (size_t i = 0; i < prop_grad.size(); i++) { + this->b->_tensor_data->_grad[i] -= prop_grad[i]; + } + } + this->b->backward(prop_grad); + } + //remove this func from the chain if all its recursive processes finished + Function::global_chain.erase(this); +} + + +template +requires Scalar +std::unique_ptr> SubFunction::clone() const { + return std::make_unique>(*this); +} + +template +requires Scalar +std::vector MatMulFunction::fwd() { + + std::vector a_shape = this->a->shape(); + std::vector b_shape = this->b->shape(); + + size_t a_n_dims = a_shape.size(); + size_t b_n_dims = b_shape.size(); + + auto a_copy = this->a->clone(); + auto b_copy = this->b->clone(); + + if (a_n_dims != b_n_dims) { + throw std::invalid_argument("operator (*) expects both opperants to have the same num of dimensions but got:"+std::to_string(a_n_dims)+"and "+std::to_string(b_n_dims)+",please ensure opperants dims match by using squeeze or unsqueeze beforehand\n"); + } + if (a_n_dims > 2) { + //Create sub vectors for the batch dimensions + std::vector a_batch_shape; + std::vector b_batch_shape; + //get only the batch dimension shapes + for (size_t i = 0; i < a_shape.size()-2; i++ ){ + a_batch_shape.push_back(a_shape[i]); + b_batch_shape.push_back(b_shape[i]); + } + for (size_t i = 0; i < a_batch_shape.size(); i++) { + //expand dims so that batch dimensions are the same + if (a_batch_shape[i] != b_batch_shape[i]) { + a_copy.expand(i,b_batch_shape[i]); + b_copy.expand(i,a_batch_shape[i]); + } + } + } + std::vector result_vector = matmul(a_copy.data(), b_copy.data(), a_copy.shape(), b_copy.shape()); + return result_vector; + +} + +template +requires Scalar +void MatMulFunction::backward(std::vector &prop_grad, CTensor *result) { + + //check if func already exists in the recursive chain + if (Function::global_chain.find(this) != Function::global_chain.end()) { + std::cout<<"cyle detected in Ctensor.backward(), ensure no incorrect reassignments to to Ctensors that were previously used in the computation graph\n"; + return; + } + + //insert this func into the chain for cycle detection + Function::global_chain.insert(this); + + + auto prop_grad_shape = result->shape(); + //std::cout<<"matmul bwd prop shape : 
"<data().size(); i++) { + prop_grad.push_back(1); + } + } + + //ensure self dependend gradients arent added twice + if (result != this->a.get()) { + auto prop_grad_a = this->a->grad(); //needs to be deeply checked + if (this->a->requires_grad == true) { + if (this->a->_tensor_data->_grad.empty()){ + //std::cout<<"a grqd empty "<a->grad().size()<<"\n"; + this->a->zero_grad(); + } + //create a copy of b and transpose it + auto b_copy = this->b->clone(); + b_copy.transpose(); + auto b_shape = transpose_shape(this->b_shape); + + prop_grad_a = matmul(prop_grad, b_copy.data(), prop_grad_shape, b_shape); + + //assign grad + for (size_t i = 0; i < prop_grad_a.size(); i++) { + this->a->_tensor_data->_grad[i] += prop_grad_a[i]; + } + } + this->a->backward(prop_grad_a); + } + + //ensure self dependend gradients arent added twice + if (result != this->b.get()) { + auto prop_grad_b = this->b->grad(); + if (this->b->requires_grad == true) { + if (this->b->_tensor_data->_grad.empty()){ + //std::cout<<"b grad empty "<b->grad().size()<<"\n"; + this->b->zero_grad(); + } + //create a copy of b and transpose it + auto a_copy = this->a->clone(); + a_copy.transpose(); + auto a_shape = transpose_shape(this->a_shape); + //std::cout<<"b bwd a_copy shape :"<b->_tensor_data->_grad[i] += prop_grad_b[i]; + } + } + this->b->backward(prop_grad_b); + } + + //remove this func from the chain if all its recursive processes finished + Function::global_chain.erase(this); +} + +template +requires Scalar +std::unique_ptr> MatMulFunction::clone() const { + return std::make_unique>(*this); +} + +template +requires Scalar +std::vector ReShapeFunction::fwd() { + return this->a->data(); +} + + +template +requires Scalar +void ReShapeFunction::backward(std::vector &prop_grad, CTensor *result){ + //std::cout<<"RESHAPEFUNCTION CALL\n"; + + switch(this->operation) { + case RESHAPE_SQUEEZE: + if (result != this->a.get()){ + this->a->backward(prop_grad); + } + break; + case RESHAPE_UNSQUEEZE: + if (result != this->a.get()){ + this->a->backward(prop_grad); + } + break; + case RESHAPE_EXPAND: + std::cout<<"\n\nWARNING: This CTensor was expanded in the computational graph, therefore gradients can not be calculated further in this branch\n\n"; + break; + + case RESHAPE_REDUCE: + break; + case RESHAPE_PERMUTE: + + break; + case RESHAPE_TRANSPOSE: + if (result != this->a.get()){ + this->a->backward(prop_grad); + } + break; + default: //should throw exeption + break; + } +} + +template +requires Scalar +std::unique_ptr> ReShapeFunction::clone() const{ + return std::make_unique>(*this); +} + +}//namespace + +#endif \ No newline at end of file diff --git a/src/CTensorUtils.tpp b/src/CTensorUtils.tpp new file mode 100644 index 0000000..ca0b26a --- /dev/null +++ b/src/CTensorUtils.tpp @@ -0,0 +1,181 @@ +// Copyright (c) <2025>, +// +// This file is part of the PySplineNetLib project, which is licensed under the +// Mozilla Public License, Version 2.0 (MPL-2.0). 
+// +// SPDX-License-Identifier: MPL-2.0 +// For the full text of the licenses, see: +// - Mozilla Public License 2.0: https://opensource.org/licenses/MPL-2.0 + + + + +#ifndef CTENSORUTILS_TPP +#define CTENSORUTILS_TPP + +#include "../include/SplineNetLib/CTensorUtils.hpp" + +namespace SplineNetLib { + +template +std::vector randomVector(size_t size, T min, T max) { + // Random number generator + std::random_device rd; + std::mt19937 gen(rd()); + + // Distribution depending on type T + typename std::conditional::value, + std::uniform_int_distribution, + std::uniform_real_distribution>::type dist(min, max); + + std::vector vec(size); + for (auto& v : vec) { + v = dist(gen); + } + return vec; +} + + +template +int get_depth(const T &scalar) { + return 0; +} + +template +int get_depth(const T &vec) { + int max_depth = 1; + for (const auto &element : vec) { + max_depth = std::max(max_depth, 1 + get_depth(element)); + } + return max_depth; +} + +template +std::string vectorToString(const std::vector& vec) { + std::ostringstream oss; + oss << "("; + for (size_t i = 0; i < vec.size(); ++i) { + oss << vec[i]; + if (i < vec.size() - 1) { + oss << ", "; + } + } + oss << ")"; + return oss.str(); +} + +template +std::vector get_shape(const T &scalar, std::vector Shape) { + return Shape; +} + +template +std::vector get_shape(const T &vec, std::vector Shape) { + Shape.push_back(vec.size()); + return get_shape(vec[0], Shape); +} + +template +void Flatten(const T &in_scalar, std::vector &result) { + result.push_back(in_scalar); +} + +template +void Flatten(const T &in_vector, std::vector &result) { + for (const auto &vec : in_vector) { + Flatten(vec, result); + } +} + +template +std::vector Flatten(const T& in_vector) { + std::vector result; + Flatten(in_vector, result); + return result; +} + +inline size_t stride(size_t idx, const std::vector &shape) { + size_t stride = 1; + for (size_t i = idx + 1; i < shape.size(); i++) { + stride *= shape[i]; + } + return stride; +} + +//math funcs + +template // Template function that accepts any scalar type 'T' (e.g., float, double) +requires Scalar // This constraint ensures that the type 'T' is a scalar (e.g., not a matrix, vector, etc.) +std::vector matmul(const std::vector &A, const std::vector &B, const std::vector &A_shape, const std::vector &B_shape) { + size_t batch_size = 1; // Variable to store the number of batches (default to 1) + //std::cout<<"debug : matmul : a shape = "<(1, 0); // This return statement is unreachable due to the exception, but just in case. 
+
+//math funcs
+
+template <typename T> // Template function that accepts any scalar type 'T' (e.g., float, double)
+requires Scalar<T> // This constraint ensures that the type 'T' is a scalar (e.g., not a matrix, vector, etc.)
+std::vector<T> matmul(const std::vector<T> &A, const std::vector<T> &B, const std::vector<size_t> &A_shape, const std::vector<size_t> &B_shape) {
+    size_t batch_size = 1; // Variable to store the number of batches (default to 1)
+    //std::cout<<"debug : matmul : a shape = "<<vectorToString(A_shape)<<" b shape = "<<vectorToString(B_shape)<<"\n";
+
+    // Ensure the inner dimensions match (columns of A == rows of B)
+    if (A_shape[A_shape.size() - 1] != B_shape[B_shape.size() - 2]) {
+        throw std::invalid_argument("matmul : A and B shapes do not match");
+        return std::vector<T>(1, 0); // This return statement is unreachable due to the exception, but just in case.
+    }
+
+
+    // If A has more than 2 dimensions (e.g., batching is involved), calculate the batch size
+    if (A_shape.size() > 2) {
+        for (size_t i = 0; i < A_shape.size() - 2; i++) {
+            batch_size *= A_shape[i]; // Multiply the sizes of the leading dimensions (batch dimensions)
+        }
+    }
+
+    // Get the dimensions for matrix multiplication
+    size_t M = A_shape[A_shape.size() - 2]; // Rows of A
+    size_t K = A_shape[A_shape.size() - 1]; // Columns of A and rows of B
+    size_t N = B_shape[B_shape.size() - 1]; // Columns of B
+
+    // Initialize the result vector with a size to hold all results (batch_size * M * N)
+    std::vector<T> result(batch_size * M * N);
+
+    // Perform matrix multiplication for each batch
+    for (size_t batch_dim = 0; batch_dim < batch_size; batch_dim++) {
+        for (size_t row = 0; row < M; row++) { // Iterate over each row of A
+            for (size_t col = 0; col < N; col++) { // Iterate over each column of B
+                T sum = 0.0; // Initialize the sum for the current element in the result matrix
+                for (size_t shared = 0; shared < K; shared++) { // Iterate over the shared dimension (columns of A, rows of B)
+                    // Perform the dot product between the row of A and the column of B
+                    sum += A[batch_dim * M * K + row * K + shared] * B[batch_dim * K * N + shared * N + col];
+                }
+                // Store the computed value in the result vector at the appropriate position
+                result[batch_dim * M * N + row * N + col] = sum;
+            }
+        }
+    }
+    return result; // Return the final result of the matrix multiplication
+}
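+// Worked example (illustrative): A = {1,2,3,4,5,6} with A_shape = {2,3} and
+// B = {7,8,9,10,11,12} with B_shape = {3,2} give
+// matmul(A, B, A_shape, B_shape) == {58, 64, 139, 154}, i.e. [[58,64],[139,154]].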
+
+template <typename T>
+requires Scalar<T>
+std::vector<T> permute_vec(const std::vector<T>& A, const std::vector<size_t>& A_shape, const std::vector<size_t>& permutation_indices) {
+    std::vector<T> B(A.size(), 0);
+    std::vector<size_t> B_shape;
+
+    for (const auto& idx : permutation_indices) {
+        B_shape.push_back(A_shape[idx]);
+    }
+
+    for (size_t i = 0; i < A.size(); i++) {
+        size_t idx = 0;
+        for (size_t k = 0; k < A_shape.size(); k++) {
+            idx += ((i / stride(permutation_indices[k], A_shape)) % B_shape[k]) * stride(k, B_shape);
+        }
+        B[idx] = A[i];
+    }
+    return B;
+}
+
+inline std::vector<size_t> transpose_shape(const std::vector<size_t>& shape) {
+    std::vector<size_t> temp = shape;
+    size_t n_dims = temp.size();
+    temp[n_dims - 2] = shape[n_dims - 1];
+    temp[n_dims - 1] = shape[n_dims - 2];
+    return temp;
+}
+
+}//namespace
+
+#endif
\ No newline at end of file
diff --git a/src/SplineNetLib_py.cpp b/src/SplineNetLib_py.cpp
index 9194507..674e15a 100644
--- a/src/SplineNetLib_py.cpp
+++ b/src/SplineNetLib_py.cpp
@@ -14,14 +14,57 @@
 // - Mozilla Public License 2.0: https://opensource.org/licenses/MPL-2.0
 // - MIT License: https://opensource.org/licenses/MIT
 
-#include <iostream>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h> // To handle STL types like std::string, std::vector
+#include <pybind11/operators.h>
 #include "SplineNetLib/SplineNet.hpp" // Header for the library
+
 namespace py = pybind11;
 
+// Function to handle nested Python lists and convert them to std::vector
+template <typename T>
+void flatten_pylist(const py::handle &obj, std::vector<T> &result) {
+    if (py::isinstance<py::list>(obj)) {
+        for (const auto &item : obj.cast<py::list>()) {
+            flatten_pylist(item, result);
+        }
+    } else {
+        result.push_back(obj.cast<T>());
+    }
+}
+
+// Wrapper function to create a new flat vector
+template <typename T>
+std::vector<T> convert_pylist_to_vector(const py::list &py_list) {
+    std::vector<T> result;
+    flatten_pylist(py_list, result);
+    return result;
+}
+
+void get_shape_recursive(const py::list& py_list, std::vector<size_t>& shape) {
+    // Base case: when the list is empty, do nothing
+    if (py_list.size() == 0) return;
+
+    // Push the size of the current level
+    shape.push_back(py_list.size());
+
+    // Check if the first element is a list (nested)
+    if (py::isinstance<py::list>(py_list[0])) {
+        // Recursively call get_shape_recursive for nested lists
+        get_shape_recursive(py::cast<py::list>(py_list[0]), shape);
+    }
+}
+
+std::vector<size_t> get_shape(const py::list& py_list) {
+    std::vector<size_t> shape;
+    // Use the recursive get_shape implementation for nested lists
+    get_shape_recursive(py_list, shape);
+    return shape;
+}
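+// Example (illustrative): for the Python list [[1, 2, 3], [4, 5, 6]],
+// convert_pylist_to_vector<double> yields {1, 2, 3, 4, 5, 6} and get_shape
+// yields {2, 3}; together they feed the CTensor(data, shape) constructors below.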
.def("permute",&SplineNetLib::CTensor::permute, "None, (std::vector), swaps dimesnions at input indecies -> shape(2,1,3) permute([2,0,1] becomes: shape(3,2,1))") + .def("transpose",&SplineNetLib::CTensor::transpose, "None, (None), transposes the tensor (swaps the innermost two dimesnions)") + .def("clear_history",&SplineNetLib::CTensor::clear_history, "None, (None), clears all grad fns from the tensor (gradient propagatuon WILL NOT work after this so use carefully)") + .def("clear_graph",&SplineNetLib::CTensor::clear_graph,"None, (None), clears full computational graph for all tensors conected to this one") + //.def("backward",&SplineNetLib::CTensor::backward, "None, (None), backwards pass through this and connected graph") + .def("backward", &SplineNetLib::CTensor::backward, + py::arg("prop_grad") = std::vector(), "Backward pass, takes an optional gradient vector (defaults to empty).") + .def("__mul__", [](SplineNetLib::CTensor& self, SplineNetLib::CTensor& other) {return self * other;}) + .def("__add__", [](SplineNetLib::CTensor& self, SplineNetLib::CTensor& other) {return self + other; }) + .def("__sub__", [](SplineNetLib::CTensor& self, SplineNetLib::CTensor& other) {return self - other; }) -namespace py = pybind11; + .def("__getitem__", [](SplineNetLib::CTensor& self, size_t idx)->SplineNetLib::CTensor { return self[idx]; }); + + py::class_>(m, "CTensor") -PYBIND11_MODULE(mylibrary, m) { - // Binding the spline class - py::class_(m, "spline") - .def(py::init>&, const std::vector>&>(), // Constructor - "Constructs the spline with control points and parameters") - .def("interpolation", &SplineNetLib::spline::interpolation, - "None -> Interpolates the spline based on its points") - .def("forward", &SplineNetLib::spline::forward, - "double (double x) -> Evaluates the spline at x (if x is in bounds)") - .def("backward", &SplineNetLib::spline::backward, - "double (double in, double d_y, double out) -> Uses previous input, loss gradient, and last output for gradient descent") - .def("apply_grad", &SplineNetLib::spline::apply_grad, - "None (double lr) -> Applies gradient from backward * learning rate (lr)") - .def("get_points", &SplineNetLib::spline::get_points, - "[[double]] -> Returns spline points like [[x0, y0], ..., [xn, yn]]") - .def("get_params", &SplineNetLib::spline::get_params, - "[[double]] -> Returns spline parameters/coefficients like [[a0, b0, c0, d0], ..., [an, bn, cn, dn]]"); - - // Binding the layer class - py::class_(m, "layer") - .def(py::init(), // Constructor with size and learning rate - "Constructs a layer with the specified number of splines and learning rate") - .def(py::init>>>, - std::vector>>>>(), // Constructor for nested vector input - "Constructs a layer with nested vector inputs for spline initialization") - .def("interpolate_splines", &SplineNetLib::layer::interpolate_splines, - "None -> Calls interpolation on all splines in the layer") - - // Overloaded 'forward' methods - .def("forward", py::overload_cast, bool>(&SplineNetLib::layer::forward), - "[double] (x, bool normalize) -> Forward call for single input sample, applies normalization if needed") - .def("forward", py::overload_cast>&, bool>(&SplineNetLib::layer::forward), - "[[double]] (x, bool normalize) -> Forward call for batch inputs, applies normalization if needed") + .def(py::init&, const std::initializer_list&>()) + .def(py::init&, const std::vector&>()) + .def(py::init&>()) + .def(py::init([](const py::list &py_list) {//only for py module to turn nested lists and turn them to nested vector + auto 
+
+    py::class_<SplineNetLib::CTensor<double>>(m, "CTensor")
-PYBIND11_MODULE(mylibrary, m) {
-    // Binding the spline class
-    py::class_<SplineNetLib::spline>(m, "spline")
-        .def(py::init<const std::vector<std::vector<double>>&, const std::vector<std::vector<double>>&>(), // Constructor
-             "Constructs the spline with control points and parameters")
-        .def("interpolation", &SplineNetLib::spline::interpolation,
-             "None -> Interpolates the spline based on its points")
-        .def("forward", &SplineNetLib::spline::forward,
-             "double (double x) -> Evaluates the spline at x (if x is in bounds)")
-        .def("backward", &SplineNetLib::spline::backward,
-             "double (double in, double d_y, double out) -> Uses previous input, loss gradient, and last output for gradient descent")
-        .def("apply_grad", &SplineNetLib::spline::apply_grad,
-             "None (double lr) -> Applies gradient from backward * learning rate (lr)")
-        .def("get_points", &SplineNetLib::spline::get_points,
-             "[[double]] -> Returns spline points like [[x0, y0], ..., [xn, yn]]")
-        .def("get_params", &SplineNetLib::spline::get_params,
-             "[[double]] -> Returns spline parameters/coefficients like [[a0, b0, c0, d0], ..., [an, bn, cn, dn]]");
-
-    // Binding the layer class
-    py::class_<SplineNetLib::layer>(m, "layer")
-        .def(py::init<int, double>(), // Constructor with size and learning rate
-             "Constructs a layer with the specified number of splines and learning rate")
-        .def(py::init<std::vector<std::vector<std::vector<std::vector<double>>>>,
-                      std::vector<std::vector<std::vector<std::vector<double>>>>>(), // Constructor for nested vector input
-             "Constructs a layer with nested vector inputs for spline initialization")
-        .def("interpolate_splines", &SplineNetLib::layer::interpolate_splines,
-             "None -> Calls interpolation on all splines in the layer")
-
-        // Overloaded 'forward' methods
-        .def("forward", py::overload_cast<std::vector<double>, bool>(&SplineNetLib::layer::forward),
-             "[double] (x, bool normalize) -> Forward call for single input sample, applies normalization if needed")
-        .def("forward", py::overload_cast<std::vector<std::vector<double>>&, bool>(&SplineNetLib::layer::forward),
-             "[[double]] (x, bool normalize) -> Forward call for batch inputs, applies normalization if needed")
+        .def(py::init<const std::initializer_list<double>&, const std::initializer_list<size_t>&>())
+        .def(py::init<const std::vector<double>&, const std::vector<size_t>&>())
+        .def(py::init<const SplineNetLib::CTensor<double>&>())
+        .def(py::init([](const py::list &py_list) { //py module only: flatten a nested Python list into flat data + shape
+            auto nested_vector = convert_pylist_to_vector<double>(py_list);
+            std::vector<size_t> shape = get_shape(py_list);
+            return new SplineNetLib::CTensor<double>(nested_vector, shape);
+        }))
+        .def("data",&SplineNetLib::CTensor<double>::data,"std::vector, (None), returns the stored data vector as a copy")
+        .def("shape",&SplineNetLib::CTensor<double>::shape,"std::vector, (None), returns the shape of the tensor like (dim0, dim1, ..., dimN)")
+        .def("grad",&SplineNetLib::CTensor<double>::grad, "std::vector, (None), returns the grad as a flat 1D projected vector (internally using tensor.shape)")
+        .def("zero_grad",&SplineNetLib::CTensor<double>::zero_grad, "None, (None), sets the gradient of this tensor to 0")
+        .def("squeeze",&SplineNetLib::CTensor<double>::squeeze, "None, (size_t dim), removes the dim and projects the data to the new shape")
+        .def("unsqueeze",&SplineNetLib::CTensor<double>::unsqueeze, "None, (size_t dim), adds a new dim at the input dim index")
+        .def("expand",&SplineNetLib::CTensor<double>::expand, "None, (size_t dim, size_t factor), expands the dimension at dim by factor -> shape (2,2) expand(0,3) becomes shape (6,2) (note this WILL affect the data)")
+        .def("permute",&SplineNetLib::CTensor<double>::permute, "None, (std::vector<size_t>), swaps dimensions at the input indices -> shape (2,1,3) permute([2,0,1]) becomes shape (3,2,1)")
+        .def("transpose",&SplineNetLib::CTensor<double>::transpose, "None, (None), transposes the tensor (swaps the innermost two dimensions)")
+        .def("clear_history",&SplineNetLib::CTensor<double>::clear_history, "None, (None), clears all grad fns from the tensor (gradient propagation WILL NOT work after this, so use carefully)")
+        .def("clear_graph",&SplineNetLib::CTensor<double>::clear_graph,"None, (None), clears the full computational graph for all tensors connected to this one")
+        //.def("backward",&SplineNetLib::CTensor<double>::backward, "None, (None), backward pass through this and the connected graph")
+        .def("backward", &SplineNetLib::CTensor<double>::backward,
+             py::arg("prop_grad") = std::vector<double>(), "None, (optional [float] prop_grad), backward pass through this and the connected graph")
+        .def("__mul__", [](SplineNetLib::CTensor<double>& self, SplineNetLib::CTensor<double>& other) { return self * other; })
+        .def("__add__", [](SplineNetLib::CTensor<double>& self, SplineNetLib::CTensor<double>& other) { return self + other; })
+        .def("__sub__", [](SplineNetLib::CTensor<double>& self, SplineNetLib::CTensor<double>& other) { return self - other; })
+
+        .def("__getitem__", [](SplineNetLib::CTensor<double>& self, size_t idx) -> SplineNetLib::CTensor<double> { return self[idx]; });
-        // Overloaded 'backward' methods
-        .def("backward", py::overload_cast<std::vector<double>, std::vector<double>, bool>(&SplineNetLib::layer::backward),
-             "[double] (x, d_y, bool normalize) -> Backward propagation for single input sample, applies grad if normalize is True")
-        .def("backward", py::overload_cast<std::vector<std::vector<double>>&, std::vector<std::vector<double>>>(&SplineNetLib::layer::backward),
-             "[[double]] (x, d_y) -> Backward propagation for batch inputs, always applies gradients")
-        .def("get_splines", &SplineNetLib::layer::get_splines,
-             "[[SplineNetLib::spline]] -> Returns all splines in the layer");
-}
-*/
\ No newline at end of file
+}
+
+
+
diff --git a/tests/unit_tests/py_spline_tests.py b/tests/unit_tests/py_spline_tests.py
new file mode 100644
index 0000000..508454c
--- /dev/null
+++ b/tests/unit_tests/py_spline_tests.py
@@ -0,0 +1,63 @@
+import PySplineNetLib
+import unittest
+
+class Spline_Test(unittest.TestCase):
+
+    def test_Spline_init_Test(self):
+        A = PySplineNetLib.spline([[0,0],[0.5,1],[1,2]],[[0,0,0,0],[0,0,0,0]])
+        A.interpolation()
+        a : float = A.forward(0.25)
+        self.assertAlmostEqual(0.5, a, delta = 0.000001)
+        a_y : float = A.backward(0.25, 0, 1)
+        # returns A.forward(0.25) - out + d_y = 0.5 - 1 + 0 = -0.5
+        self.assertAlmostEqual(-0.5, a_y, delta = 0.000001)
+        A.apply_grad(1) # applies the gradient with factor 1.0 (moves y_i at x_i > 0.25 by -1 * grad {same as sign(grad)})
+        A.interpolation() # finds new params for the updated spline
+        self.assertListEqual([[0.0, 0.5, 0.0, 2.0], [0.5, 2.0, 3.0, -2.0]], A.get_params())
+        self.assertListEqual([[0.0, 0.0], [0.5, 0.5], [1.0, 2.0]], A.get_points())
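+# Note on the expected gradients in test_Ctensor_grad_Test below (hand-derived):
+# for d = a * b + c with an all-ones seed gradient dL/dd,
+#   grad(a) = ones(2,2) @ b.T -> each row is [3, 7, 11]
+#   grad(b) = a.T @ ones(2,2) -> every entry is 2 + 2 = 4
+#   grad(c) = ones(2,2)       -> [1, 1, 1, 1]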
+
+class CTensor_Test(unittest.TestCase):
+
+    def test_CTensor_init_Test(self):
+        a = PySplineNetLib.CTensor([[1,2,3],[4,5,6]])
+        self.assertListEqual([1,2,3,4,5,6], a.data())
+        self.assertListEqual([2,3], a.shape())
+        b = PySplineNetLib.CTensor([6,5,4,3,2,1],[3,2])
+        self.assertListEqual([6,5,4,3,2,1], b.data())
+        self.assertListEqual([3,2], b.shape())
+        c = PySplineNetLib.CTensor(a)
+        self.assertListEqual([1,2,3,4,5,6], c.data())
+        self.assertListEqual([2,3], c.shape())
+
+    def test_CTensor_math_Test(self):
+        a = PySplineNetLib.CTensor([[1,2,3],[4,5,6]])
+        b = PySplineNetLib.CTensor([[6,5,4],[3,2,1]])
+
+        c = a + b
+        self.assertListEqual([7,7,7,7,7,7], c.data())
+        self.assertListEqual([2,3], c.shape())
+
+        b.transpose()
+        d = a * b
+        self.assertListEqual([28.0, 10.0, 73.0, 28.0], d.data())
+        self.assertListEqual([2,2], d.shape())
+
+        b.transpose()
+        e = a - b
+        self.assertListEqual([-5.0, -3.0, -1.0, 1.0, 3.0, 5.0], e.data())
+        self.assertListEqual([2,3], e.shape())
+
+    def test_Ctensor_grad_Test(self):
+        a = PySplineNetLib.CTensor([[2,2,2],[2,2,2]])
+        b = PySplineNetLib.CTensor([[1,2],[3,4],[5,6]])
+        c = PySplineNetLib.CTensor([[0.5,0.5],[0.5,0.5]])
+        d = a * b + c
+        self.assertListEqual([18.5, 24.5, 18.5, 24.5], d.data())
+        self.assertListEqual([2,2], d.shape())
+        d.backward()
+        self.assertListEqual([3.0, 7.0, 11.0, 3.0, 7.0, 11.0], a.grad())
+        self.assertListEqual([4.0, 4.0, 4.0, 4.0, 4.0, 4.0], b.grad())
+        self.assertListEqual([1.0, 1.0, 1.0, 1.0], c.grad())
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file